# R Options
options(stringsAsFactors=FALSE,
        citation_format="pandoc", 
        dplyr.summarise.inform=FALSE, 
        knitr.table.format="html",
        kableExtra_view_html=TRUE,
        future.globals.maxSize=+Inf,
        mc.cores=params$cores, 
        future.fork.enable=TRUE, future.plan="multicore",
        future.rng.onMisuse="ignore")

# Python3 needed for clustering, umap, other python packages
# Path to binary will be automatically found
# Set manually if it does not work
reticulate_python3_path = unname(Sys.which("python3"))
Sys.setenv(RETICULATE_PYTHON=reticulate_python3_path)

# Required libraries
library(Seurat) # main
library(ggplot2) # plots
library(patchwork) # combination of plots
library(magrittr) # %>% operator
library(reticulate) # required for "leiden" clustering
library(enrichR) # functional enrichment
library(future) # multicore support for Seurat

# Other libraries we use
# Knit: knitr
# Data handling: dplyr, tidyr, purrr, stringr, Matrix, sctransform, glmGamPoi (optional for speed but only available for R 4.0)
# Tables: kableExtra, DT
# Plots: ggsci
# IO: openxlsx, readr, R.utils
# Annotation: biomaRt
# DEG: mast, limma (for a more efficient implementation of the Wilcoxon Rank Sum Test according to Seurat)
# Functional enrichment: enrichR
# Other: sessioninfo, cerebroApp, sceasy, ROpenSci/bibtex, knitcitations

# Knitr default options
knitr::opts_chunk$set(echo=TRUE,                     # output code
                      cache=FALSE,                   # do not cache results
                      message=TRUE,                  # show messages
                      warning=TRUE,                  # show warnings
                      tidy=FALSE,                    # do not auto-tidy-up code
                      fig.width=10,                  # default fig width in inches
                      class.source="fold-hide",      # by default collapse code blocks
                      dev=c("png", "pdf"),           # create figures in png and pdf; the first device (png) will be used for HTML output
                      dev.args=list(png=list(type="cairo"),  # png: use cairo - works on cluster, supports anti-aliasing (more smooth)
                                    pdf=list(bg="white")),     # pdf: use cairo - works on cluster, supports anti-aliasing (more smooth)
                      dpi=96,                        # figure resolution
                      fig.retina=2                   # retina multiplier
)

# If the DOI publication servers cannot be reached, there will be no citations, knitcitations will not write a references.bib file and pandoc will stop. This makes sure that there is always at least one citation.
invisible(knitcitations::citep(citation("knitr")))
cat(paste0('<link rel="stylesheet" href="', params$path_to_git, '/css/style.css">'))

# Copy input parameters to internal parameter list
param = params
if (is.null(param$samples_to_drop )) param$samples_to_drop = c()
if (is.null(param$vars_to_regress)) param$vars_to_regress = c()
if (is.null(param$latent_vars)) param$latent_vars = c()

# Git directory and files to source must be done first, then all helper functions can be sourced
git_files_to_source = c("functions_io.R",
                        "functions_plotting.R",
                        "functions_analysis.R",
                        "functions_degs.R",
                        "functions_util.R")
git_files_to_source = file.path(param$path_to_git, "R", git_files_to_source)
file_exists = purrr::map_lgl(git_files_to_source, file.exists)
if (any(!file_exists)) stop(paste("The following files could not be found:", paste(git_files_to_source[!file_exists], collapse=", "), ". Please check the git directory at '", param$path_to_git, "'.!"))
invisible(purrr::map(git_files_to_source, source))

# Debugging mode: 
switch (param$debugging_mode, 
        default_debugging=on_error_default_debugging(), 
        terminal_debugger=on_error_start_terminal_debugger(),
        print_traceback=on_error_just_print_traceback(),
        on_error_default_debugging())

# Set output hooks
knitr::knit_hooks$set(message=format_message, warning=format_warning)

# Create output directories
if (!file.exists(param$path_out)) dir.create(param$path_out, recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "figures"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "marker_degs"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "annotation"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "data"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "export"), recursive=TRUE, showWarnings=FALSE)

# Path for figures in png and pdf format (trailing "/" is needed)
knitr::opts_chunk$set(fig.path=paste0(file.path(param$path_out, "figures"), "/"))

# Path for annotation
param$file_annot = file.path(param$path_out, "annotation", paste0(param$mart_dataset, ".v", param$annot_version, ".annot.txt"))

# Path for cell cycle genes file
param$file_cc_genes = file.path(param$path_out, "annotation", "cell_cycle_markers.xlsx")

# Do checks
error_messages = c()

# Check parameters and parse entries (e.g. numbers) so that they are valid
param = check_parameters_scrnaseq(param)
error_messages = c(error_messages, param[["error_messages"]])
# Check installed packages
error_messages = c(error_messages, check_installed_packages_scrnaseq())
# Check python
error_messages = c(error_messages, check_python())
# Check pandoc
error_messages = c(error_messages, check_pandoc())
# Check enrichR
error_messages = c(error_messages, check_enrichr(param$enrichr_dbs, param$enrichr_site))
# Check ensembl
error_messages = c(error_messages, check_ensembl(biomart="ensembl", 
                                                 dataset=param$mart_dataset, 
                                                 mirror=param$biomart_mirror, 
                                                 version=param$annot_version,
                                                 attributes=param$mart_attributes,
                                                 file_annot=param$file_annot,
                                                 file_cc_markers=param$file_cc_genes))

# Stop here if there are errors
if (length(error_messages)) stop(paste(c("", paste("*", error_messages)), collapse="\n"))

Read data

The workflow can be run for pre-processed 10x data, as well as for pre-processed SmartSeq-2 or other data that are represented by a simple table with transcript counts per gene and cell.

Read and print general metrics

Prior to this workflow, raw sequencing reads were mapped to the reference transcriptome. The resulting mapping statistics are printed below for a first estimation of sample quality.

Pre-processing of 10x data with Cell Ranger We use the 10x Genomics Cell Ranger software to map 10x sequencing reads. The result is a count matrix that contains the number of unique transcripts per gene (rows) and cell (columns). To save storage space, the matrix is converted into a condensed format and described by the following 3 files:
  • “features.tsv.gz”: Tabular file with gene annotation (Ensembl gene ID and the gene symbol) to identify genes
  • “barcodes.tsv.gz”: File with cell barcodes to identify cells
  • “matrix.mtx.gz”: A condensed version of the count matrix
Pre-processing of SmartSeq-2 data Sequencing reads from SmartSeq-2 (and other) experiments can be mapped with any suitable mapping software as long as a count matrix is generated that contains the number of mapped reads per gene (rows) and cell (columns). The first row of the count matrix should contain cell barcodes or names, the first column of the matrix should contain Ensembl gene IDs.
# Are metrics provided?
if (!all(is.na(param$path_data$stats))) { 

  # Loop through all samples and read metrics
  mapping_stats_list = list()
  for (i in 1:nrow(param$path_data)) {  
    if (!is.na(param$path_data$stats[i])) { 
      mapping_stats_list[[param$path_data$name[i]]] = Read10XMetrics(param$path_data$stats[i])
    } 
  }
  
  # Join all mapping stats tables
  mapping_stats = mapping_stats_list %>% purrr::reduce(dplyr::full_join, by="metric")
  rownames(mapping_stats) = mapping_stats[["metric"]]
  mapping_stats = mapping_stats %>% dplyr::select(-metric)
  colnames(mapping_stats) = names(mapping_stats_list)
 
  # Print table to HTML 
  knitr::kable(mapping_stats, align="l", caption="General metrics") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
  kableExtra::scroll_box(width="100%", fixed_thead=TRUE)
 
} else { 
  message("Mapping statistics cannot be shown. No valid file provided.")
}

× (Message)
Mapping statistics cannot be shown. No valid file provided.

Read gene annotation

We read gene annotation from Ensembl and write the resulting table to file. We generate several dictionaries to translate between Ensembl IDs, gene symbols, Entrez Ids, and Seurat gene names.

# Read annotation from csv or from Ensembl and a tab separated txt will be created
if (file.exists(param$file_annot)) {
  annot_ensembl = read.delim(param$file_annot)
} else {
  annot_mart = suppressWarnings(GetBiomaRt(biomart="ensembl", 
                                           dataset=param$mart_dataset, 
                                           mirror=param$biomart_mirror, 
                                           version=param$annot_version))
  annot_ensembl = biomaRt::getBM(mart=annot_mart, attributes=param$mart_attributes, useCache=FALSE)
  write.table(annot_ensembl, file=param$file_annot, sep="\t", col.names=TRUE, row.names=FALSE, append=FALSE)
  message("Gene annotation file was created at: ", param$file_annot)
  # Note: depending on the attributes, there might be more than one row per gene
}

# Double-check if we got all required annotation, in case annotation file was read
check_annot_main = all(param$annot_main %in% colnames(annot_ensembl))
if (!check_annot_main) {
  stop("The annotation table misses at least one of the following columns: ", paste(param$annot_main, collapse=", "))
}

# Create translation tables
ensembl = param$annot_main["ensembl"]
symbol = param$annot_main["symbol"]
entrez = param$annot_main["entrez"]

# Ensembl id to gene symbol (new)
ensembl_to_symbol = unique(annot_ensembl[, c(ensembl, symbol)])
ensembl_to_symbol[, symbol] = ifelse(nchar(ensembl_to_symbol[, symbol]) == 0, NA, ensembl_to_symbol[, symbol])
ensembl_to_symbol = setNames(ensembl_to_symbol[, symbol], ensembl_to_symbol[, ensembl])

# Ensembl id to seurat-compatible unique rowname (new)
ensembl_to_seurat_rowname = unique(annot_ensembl[, c(ensembl, symbol)])
ensembl_to_seurat_rowname[, symbol] = ifelse(nchar(ensembl_to_seurat_rowname[, symbol]) == 0, ensembl_to_seurat_rowname[, ensembl], ensembl_to_seurat_rowname[, symbol])
ensembl_to_seurat_rowname[, symbol] = make.unique(gsub(pattern="_", replacement="-", x=ensembl_to_seurat_rowname[, symbol], fixed=TRUE))
ensembl_to_seurat_rowname = setNames(ensembl_to_seurat_rowname[, symbol], ensembl_to_seurat_rowname[, ensembl])

# Seurat-compatible unique rowname to ensembl id
seurat_rowname_to_ensembl = setNames(names(ensembl_to_seurat_rowname), ensembl_to_seurat_rowname)

# Ensembl to Entrez
ensembl_to_entrez = unique(annot_ensembl[, c(ensembl, entrez)])
ensembl_to_entrez[, entrez] = ifelse(nchar(ensembl_to_entrez[, entrez]) == 0, NA, ensembl_to_entrez[, entrez])
ensembl_to_entrez = split(ensembl_to_entrez[, entrez], ensembl_to_entrez[, ensembl])

# Seurat-compatible unique rowname to Entrez
seurat_rowname_to_ensembl_match = match(seurat_rowname_to_ensembl, names(ensembl_to_entrez))
names(seurat_rowname_to_ensembl_match) = names(seurat_rowname_to_ensembl)
seurat_rowname_to_entrez = purrr::map(seurat_rowname_to_ensembl_match, function(i) {unname(ensembl_to_entrez[[i]])})

# Entrez IDs is duplicating Ensembl IDs in annot_ensembl
# Therefore, we remove Entrez IDs from the annotation table, after generating all required translation tables
# Set rownames of annotation table to Ensembl identifiers
annot_ensembl = annot_ensembl[, -match(entrez, colnames(annot_ensembl))] %>% unique() %>% as.data.frame()
rownames(annot_ensembl) = annot_ensembl[, ensembl]
# Use biomart to translate human cell cycle genes to the species of interest and save them in a file
if (file.exists(param$file_cc_genes)) {
  # Load from file
  genes_s = openxlsx::read.xlsx(param$file_cc_genes, sheet=1)
  genes_g2m = openxlsx::read.xlsx(param$file_cc_genes, sheet=2)
  
} else { 
  # Obtain from Ensembl
  # Note: both mart objects must point to the same mirror for biomarT::getLDS to work
  mart_human = suppressWarnings(GetBiomaRt(biomart="ensembl", 
                                           dataset="hsapiens_gene_ensembl", 
                                           mirror=param$biomart_mirror, 
                                           version=param$annot_version))
  mart_myspecies = suppressWarnings(GetBiomaRt(biomart="ensembl", 
                                               dataset=param$mart_dataset, 
                                               mirror=GetBiomaRtMirror(mart_human), 
                                               version=param$annot_version)) 
  
  # S phase marker
  genes_s = biomaRt::getLDS(attributes=c("ensembl_gene_id", "external_gene_name"), 
                          filters="external_gene_name", 
                          values=Seurat::cc.genes.updated.2019$s.genes, 
                          mart=mart_human, 
                          attributesL=c("ensembl_gene_id", "external_gene_name"), 
                          martL=mart_myspecies, 
                          uniqueRows=TRUE)
  colnames(genes_s) = c("Human_ensembl_id", "Human_gene_name", "Species_ensembl_id", "Species_gene_name")
  genes_s = genes_s %>% dplyr::arrange(Human_gene_name)
  
  # G2/M marker
  genes_g2m = biomaRt::getLDS(attributes=c("ensembl_gene_id", "external_gene_name"), 
                            filters="external_gene_name", 
                            values=Seurat::cc.genes.updated.2019$g2m.genes, 
                            mart=mart_human, 
                            attributesL=c("ensembl_gene_id", "external_gene_name"), 
                            martL=mart_myspecies, 
                            uniqueRows=TRUE)
  colnames(genes_g2m) = c("Human_ensembl_id", "Human_gene_name", "Species_ensembl_id", "Species_gene_name")
  genes_g2m = genes_g2m %>% dplyr::arrange(Human_gene_name)
  
  # Write to file
  openxlsx::write.xlsx(list(S_phase=genes_s,G2M_phase=genes_g2m), file=param$file_cc_genes)
}

# Convert Ensembl ID to Seurat-compatible unique rowname
genes_s = data.frame(Human_gene_name=genes_s$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_s$Species_ensembl_id]))
genes_g2m = data.frame(Human_gene_name=genes_g2m$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_g2m$Species_ensembl_id]))

Read scRNA-seq data

We next read the single-cell RNA-seq data into a Seurat object.
What is Seurat and which information is contained in a Seurat object?

Seurat is an R-package that is used for the analysis of single-cell RNA-seq data. We read pre-processed data as described above, and convert it to a count matrix in R. In the case of 10x data, the count matrix contains the number of unique RNA molecules (UMIs) per gene and cell. In the case of SmartSeq-2 data, the count matrix contains the number of reads per gene and cell.

In addition to the count matrix, the workflow stores additional information in the Seurat object, including but not limited to the normalized data, dimensionality reduction and cluster results.
# List of Seurat objects
sc = list()

datasets = param$path_data
for (i in seq(nrow(datasets))) {
  name = datasets[i, "name"]
  type = datasets[i, "type"]
  path = datasets[i, "path"]
  suffix = datasets[i, "suffix"]
  
  # Read 10X or smartseq2
  if (type == "10x") {
    # Read 10X sparse matrix into a Seurat object
    sc = c(sc, ReadSparseMatrix(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, cellnames_suffix=suffix))
    
  } else if (type == "smartseq2") {
    # Read counts table into a Seurat object
    sc = c(sc, ReadCountsTable(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, parse_plate_information=TRUE, return_samples_as_datasets=TRUE, cellnames_suffix=suffix))
  } 
}

# Make sure that sample names are unique. If not, just prefix with the dataset name. Also set orig.ident to this name.
sample_names = names(sc)
duplicated_sample_names_idx = which(sample_names %in% sample_names[duplicated(sample_names)])
for (i in duplicated_sample_names_idx) {
  sample_names[i] = paste(head(sc[[i]][["orig.dataset", drop=TRUE]], 1), sample_names[i], sep=".")
  sc[[i]][["orig.ident"]] = sample_names[i]
}

# Set up colors for samples and add them to the sc objects
sample_names = purrr::flatten_chr(purrr::map(sc, function(s) {
  nms = unique(as.character(s[[]][["orig.ident"]]))
  return(nms) 
}))
param$col_samples = GenerateColours(num_colours=length(sample_names), names=sample_names, palette=param$col_palette_samples, alphas=1)
sc = purrr::map(sc, ScAddLists, lists=list(orig.ident=param$col_samples), lists_slot="colour_lists")

# 
### Add conditions per sample here if necessary
#

sc
## $pbmc_10x
## An object of class Seurat 
## 33694 features across 4033 samples within 1 assay 
## Active assay: RNA (33694 features, 0 variable features)
## 
## $pbmc_smartseq2_sample1
## An object of class Seurat 
## 33694 features across 311 samples within 1 assay 
## Active assay: RNA (33694 features, 0 variable features)
# Remove contaminant cells if needed
if (!is.null(param$file_contam_cells) && file.exists(param$file_contam_cells)) {
  contam_cells = read.delim(param$file_contam_cells, header=FALSE)[,1]
  message("Read ", length(contam_cells), " names of contaminant cells.")

  n_contam_cells = 0
  for (i in seq(sc)) { 
    bool_contam_cells = colnames(sc[[i]]) %in% contam_cells
    n_contam_cells = n_contam_cells + sum(bool_contam_cells)
    names_contam_cells = colnames(sc[[i]])[!bool_contam_cells]
    sc[[i]] = subset(sc[[i]], cells=names_contam_cells)
  }
  message("Removed ", n_contam_cells, " contaminant cells from the dataset.")
  
  sc
}

# Rename doublet scores
for (i in seq(sc)) {
  if (!is.null(param$doublets)) {
    sc[[i]][[]] = dplyr::rename(sc[[i]][[]], DoubletScore=param$doublets)
  } 
}

The following first table shows available metadata (columns) of the first 5 cells (rows). These metadata provide additional information about the cells in the dataset, such as the sample a cell belongs to (“orig.ident”), the number of mapped reads (“nCounts_RNA”), or the above mentioned number of unique genes detected (“nFeature_RNA”). The second table shows available metadata (columns) of the first 5 genes (rows).

# Combine cell metadata of the Seurat objects into one big metadata
sc_cell_metadata = suppressWarnings(purrr::map_dfr(sc, function(s) return(s[[]])) %>% as.data.frame())

# Print cell metadata
knitr::kable(head(sc_cell_metadata), align="l", caption="Cell metadata, top 5 rows") %>%
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))  %>% 
  kableExtra::scroll_box(width="100%")
Cell metadata, top 5 rows
orig.ident nCount_RNA nFeature_RNA orig.dataset SampleName PlateNumber PlateRow PlateCol
PBMC1_10x_AAACCCACACTTGGGC-1 pbmc_10x 7552 2037 pbmc_10x NA NA NA NA
PBMC1_10x_AAACCCACAGGTGGAT-1 pbmc_10x 4773 1800 pbmc_10x NA NA NA NA
PBMC1_10x_AAACCCAGTGCTTATG-1 pbmc_10x 4430 1565 pbmc_10x NA NA NA NA
PBMC1_10x_AAACCCATCCGTAGTA-1 pbmc_10x 4512 1592 pbmc_10x NA NA NA NA
PBMC1_10x_AAACCCATCTTACACT-1 pbmc_10x 6663 1919 pbmc_10x NA NA NA NA
PBMC1_10x_AAACGAAGTCTAGTGT-1 pbmc_10x 143 89 pbmc_10x NA NA NA NA
# Print gene metadata
knitr::kable(head(sc[[1]][[param$assay_raw]][[]], 5), align="l", caption="Feature metadata, top 5 rows (only first dataset shown)") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))  %>% 
  kableExtra::scroll_box(width="100%")
Feature metadata, top 5 rows (only first dataset shown)
feature_id feature_name feature_type
TSPAN6 ENSG00000000003 TSPAN6 Gene Expression
TNMD ENSG00000000005 TNMD Gene Expression
DPM1 ENSG00000000419 DPM1 Gene Expression
SCYL3 ENSG00000000457 SCYL3 Gene Expression
C1orf112 ENSG00000000460 C1orf112 Gene Expression

Pre-processing

The steps below represent a standard pre-processing workflow for single-cell RNA-seq data, including quality control, the respective filtering of cells and genes, data normalization and scaling, and the detection of highly variable genes.

Why is pre-processing so important? Cells may have been damaged during sample preparation and might be only partially captured in the sequencing. In addition, free-floating mRNA from damaged cells can be encapsulated, adding to the background noise. The low-quality cells and free-floating mRNA interfere with downstream analyses. Therefore, cells and genes are filtered based on defined quality metrics. Data normalization eliminates cell-specific biases such as the absolute number of reads per cell, allowing us to systematically compare cells afterwards. Subsequent scaling corrects for the fact that genes have different lengths, allowing us to compare genes afterwards. Lastly, highly variable genes are determined, reducing computational overhead and noise from genes that are not interesting.

Quality control

We start the analysis by removing unwanted cells from the dataset(s). Three commonly used QC metrics include the number of unique genes detected in each cell (“nFeature_RNA”), the total number of molecules detected in each cell (“nCount_RNA”), and the percentage of counts that map to the mitochrondrial genome (“percent_mt”). If ERCC spike-in controls were used, the percentage of counts mapping to them is also shown (“percent_ercc”).

Which cells can be considered to be of low quality? On the one hand, low-quality cells such as dying, degraded and damaged cells, and empty droplets will often have very few genes (“nFeature_RNA”) and low total counts (“nCount_RNA”). On the other hand, cell doublets may show an aberrantly high number of genes. Since the total number of reads detected within a cell typically strongly correlates with the number of unique genes, we can focus on the number of unique genes for filtering. In addition, damaged cells often exhibit high mitochondrial (“percent_mt”) or spike-in (“percent_ercc”) content. As mitochondria have their own membranes, their RNA is often the last to degrade in damaged cells and can thus be found in high numbers. However, it is important to keep in mind that different cell types and cells isolated from various species may differ in their number of expressed genes and metabolism. For example, stem cells may express a higher number of unique genes, and metabolically active cells may express a higher number of mitochondrial genes.
Impact of low-quality cells on downstream analyses First of all, low-quality cells of different cell types might cluster together due to similarities in their damage-induced gene expression profiles. This leads to artificial intermediate states or trajectories between subpopulations that would otherwise be distinct. Second, low-quality cells might mask relevant gene expression changes. The main differences captured by the first principal components will likely be based on cell quality rather than the underlying biology, reducing the power of dimensionality reduction. Likewise, highly variable genes might just be genes that differ between low- and high-quality cells. Lastly and equally important, the low number of total counts in low-quality cells might lead to artificial upregulation of genes due to wrong scaling effects during the normalisation process.
# If filters were specified globally (i.e. not by sample), this chunk will copy them for each sample such that downstream filtering can work by sample
param$cell_filter = purrr::map(list_names(sc), function(s) {
  if (s %in% names(param$cell_filter)) {
    return(param$cell_filter[[s]])
  } else {
    return(param$cell_filter)
  }
})

param$feature_filter = purrr::map(list_names(sc), function(s) {
  if (s %in% names(param$feature_filter)) {
    return(param$feature_filter[[s]])
  } else {
    return(param$feature_filter)
  }
})
# Calculate percentage of counts in mitochondrial genes for each Seurat object
sc = purrr::map(sc, function(s) {
  mt_features = grep(pattern=param$mt, rownames(s), value=TRUE)
  return(Seurat::PercentageFeatureSet(s, features=mt_features, col.name="percent_mt", assay=param$assay_raw))
})

# Calculate percentage of counts in ribosomal genes for each Seurat object
sc = purrr::map(sc, function(s) {
  ribo_features = grep(pattern="^RP[SL]", rownames(s), value=TRUE, ignore.case=TRUE)
  return(Seurat::PercentageFeatureSet(s, features=ribo_features, col.name="percent_ribo", assay=param$assay_raw))
})

# Calculate percentage of counts in ERCC for each Seurat object (if assay is available)
sc = purrr::map(sc, function(s) {
  if ("ERCC" %in% Seurat::Assays(s)) s$percent_ercc = s$nCount_ERCC/(s$nCount_ERCC + s$nCount_RNA)*100
  return(s)
  })

# Combine (again) cell metadata of the Seurat objects into one big metadata, this time including mt and ercc 
sc_cell_metadata = suppressWarnings(purrr::map_dfr(sc, function(s){ return(s[[]]) }) %>% as.data.frame())
# Only RNA assay at the moment
# counts_median uses sapply on the counts matrix, which converts the sparse matrix into a normal matrix
#   This might have to be adapted in future (Sparse Matrix Apply Function)
sc = purrr::map(list_names(sc), function(n) {
  # Calculate percentage of counts per gene in a cell
  counts_rna = Seurat::GetAssayData(sc[[n]], slot="counts", assay=param$assay_raw)
  total_counts = sc[[n]][[paste0("nCount_", param$assay_raw), drop=TRUE]]
  counts_rna_perc = Matrix::t(Matrix::t(counts_rna)/total_counts)*100

  # Calculate feature filters
  num_cells_expr = Matrix::rowSums(counts_rna >= 1)
  num_cells_expr_threshold = Matrix::rowSums(counts_rna >= param$feature_filter[[n]][["min_counts"]])
  
  # Calculate median of counts_rna_perc per gene 
  counts_median = sparseMatrixStats::rowMedians(counts_rna_perc)
  
  # Add all QC measures as metadata
  sc[[n]][[param$assay_raw]] = Seurat::AddMetaData(sc[[n]][[param$assay_raw]], data.frame(num_cells_expr, num_cells_expr_threshold, counts_median))
  return(sc[[n]])
})
# QC metrics to plot for cells
cell_qc_features = c(paste0(c("nFeature_", "nCount_"), param$assay_raw), "percent_mt")
if ("DoubletScore" %in% colnames(sc_cell_metadata)) cell_qc_features = c(cell_QC_features, "DoubletScore")
if ("percent_ercc" %in% colnames(sc_cell_metadata)) cell_qc_features = c(cell_qc_features, "percent_ercc")
cell_qc_features = values_to_names(cell_qc_features)

# Get filter thresholds per QC metrics (for plotting)
cell_qc_thresholds = purrr::map(cell_qc_features, function(m) {
  tresh = purrr::map_dfr(names(param$cell_filter), function(n) {
    if (m %in% names(param$cell_filter[[n]])) {
      t = data.frame(orig.ident=n, min=param$cell_filter[[n]][[m]][1], max=param$cell_filter[[n]][[m]][2]) %>% 
        tidyr::pivot_longer(cols=2:3, names_to=c("threshold")) %>%
        dplyr::filter(!is.na(value))
      t$threshold = factor(t$threshold, levels=c("min", "max"))
      return(t)
    }
  })
})

# Now create plot per QC metric
p_list = list()
for (m in cell_qc_features) {
  p_list[[m]]= ggplot(sc_cell_metadata[, c("orig.ident", m)], aes(x=orig.ident, y=!!sym(m), fill=orig.ident, group=orig.ident)) +
    geom_violin(scale="width")

  # Adds points for samples with less than three cells since geom_violin does not work here
  p_list[[m]] = p_list[[m]] + 
    geom_point(data=sc_cell_metadata[, c("orig.ident", m)] %>% dplyr::filter(orig.ident %in% names(which(table(sc_cell_metadata$orig.ident) < 3))), aes(x=orig.ident, y=!!sym(m), fill=orig.ident), shape=21, size=2)
  
  # Now add style
  p_list[[m]] = p_list[[m]] + 
    AddStyle(title=m, legend_position="none", fill=param$col_samples, xlab="") + 
    theme(axis.text.x=element_text(angle=45, hjust=1))
  
  # Add filter threshold as segments to plot; min threshold lines are dashed and max threshold lines are twodashed
  if (nrow(cell_qc_thresholds[[m]]) > 0) {
    p_list[[m]] = p_list[[m]] + geom_segment(data=cell_qc_thresholds[[m]], 
                                             aes(x=as.integer(as.factor(orig.ident))-0.5, 
                                                 xend=as.integer(as.factor(orig.ident))+0.5, 
                                                 y=value, yend=value, lty=threshold), colour="firebrick") +
      scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min")))
  }
}
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Distribution of feature values") 
p

# Correlate QC metrics for cells
p_list = list()
sc_cell_metadata_plot_order = sample(1:nrow(sc_cell_metadata))

# nFeature vs nCount
m = paste0(c("nCount_", "nFeature_"), param$assay_raw)
p_list[[1]] = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes(x=!!sym(m[1]), y=!!sym(m[2]), colour=orig.ident)) +
  geom_point() + 
  scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
  AddStyle(col=param$col_samples)
if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
  p_list[[1]] = p_list[[1]] + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
    
}
if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
  p_list[[1]] = p_list[[1]] + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
}
  

# nFeature vs percent_mt
m = c("percent_mt", paste0(c("nFeature_"), param$assay_raw))
p_list[[2]] = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes(x=!!sym(m[1]), y=!!sym(m[2]), colour=orig.ident)) +
  geom_point() +
  scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
  AddStyle(col=param$col_samples)
if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
  p_list[[2]] = p_list[[2]] + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
}
if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
  p_list[[2]] = p_list[[2]] + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
}

# nFeature vs percent_ercc (if available)
if ("percent_ercc" %in% names(cell_qc_features)) {
  m = c("percent_ercc", paste0(c("nFeature_"), param$assay_raw))
  p_list[[3]] = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes(x=!!sym(m[1]), y=!!sym(m[2]), colour=orig.ident)) +
    geom_point() + 
    scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) + 
    AddStyle(col=param$col_samples)
  if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
    p_list[[3]] = p_list[[3]] + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
  }
  if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
    p_list[[3]] = p_list[[3]] + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
  }
}

# Combine plots
p = patchwork::wrap_plots(p_list, ncol=length(p_list)) + patchwork::plot_annotation("Features plotted against each other")
if (length(p_list) == 1) {
  p = p & theme(legend.position="bottom")
} else {
  p = p + patchwork::plot_layout(guides="collect") & theme(legend.position="bottom")
}
p

Genes with highest expression

We next investigate whether there are individual genes that are represented by an unusually high number of counts. For each cell, we first calculate the percentage of counts per gene. Subsequently, for each gene, we calculate the median value of these percentages in all cells. Genes with the highest median percentage of counts are plotted below.

# Plot only samples that we intend to keep 
sc_names = names(sc)[!(names(sc) %in% param$samples_to_drop)]
genes_highestExpr = lapply(sc_names, function(i) {
  top_ten_exp = sc[[i]][[param$assay_raw]][["counts_median"]] %>% dplyr::arrange(dplyr::desc(counts_median)) %>% head(n=10)
  return(rownames(top_ten_exp))
  }) %>%
  unlist() %>%
  unique()

genes_highestExpr_counts = purrr::map_dfc(sc[sc_names], .f=function(s) s[[param$assay_raw]][["counts_median"]][genes_highestExpr, ]) 
genes_highestExpr_counts$gene = genes_highestExpr
genes_highestExpr_counts = genes_highestExpr_counts %>% tidyr::pivot_longer(cols=all_of(sc_names))
genes_highestExpr_counts$name = factor(genes_highestExpr_counts$name, levels=sc_names)

col =  GenerateColours(num_colours=length(genes_highestExpr), names=genes_highestExpr, palette="ggsci::pal_simpsons")
p = ggplot(genes_highestExpr_counts, aes(x=name, y=value, col=gene, group=gene)) + 
  geom_point() + 
  AddStyle(title="Top 10 highest expressed genes per sample, added into one list", 
           xlab="Sample", ylab="Median % of raw counts\n per gene in a cell", 
           legend_position="bottom", 
           col=col)
if (length(unique(genes_highestExpr_counts$name))>1) p = p + geom_line()
p

Filtering

Cells and genes are filtered based on the following thresholds:

cell_filter_lst = param$cell_filter %>% unlist(recursive=FALSE)
is_numeric_filter = purrr::map_lgl(cell_filter_lst, function(f) return(is.numeric(f) & length(f)==2))

# numeric cell filters
if (length(cell_filter_lst[is_numeric_filter]) > 0) {
  purrr::exec(rbind, !!!cell_filter_lst[is_numeric_filter]) %>%
    knitr::kable(align="l", caption="Numeric filters applied to cells", col.names=c("Min", "Max")) %>% 
    kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
}
Numeric filters applied to cells
Min Max
pbmc_10x.nFeature_RNA NA NA
pbmc_10x.percent_mt NA NA
pbmc_smartseq2_sample1.nFeature_RNA NA NA
pbmc_smartseq2_sample1.percent_mt NA NA
# categorial cell filters
if (length(cell_filter_lst[!is_numeric_filter]) > 0) {
purrr::exec(rbind, !!!cell_filter_lst[!is_numeric_filter] %>% purrr::map(paste, collapse=",")) %>%
  knitr::kable(align="l", caption="Categorial filters applied to cells", col.names=c("Values")) %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
}
  
# gene filters
feature_filter_lst = param$feature_filter %>% unlist(recursive=FALSE)
if (length(feature_filter_lst) > 0) {
  purrr::exec(rbind, !!!feature_filter_lst) %>% 
    knitr::kable(align="l", caption="Filters applied to genes", col.names=c("Value")) %>%
    kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
}
Filters applied to genes
Value
pbmc_10x.min_counts 1
pbmc_10x.min_cells 3
pbmc_smartseq2_sample1.min_counts 1
pbmc_smartseq2_sample1.min_cells 3

The number of excluded cells and features is as follows:

# Iterate over datasets and filters
# Record a cell if it does not pass the filter
# Also record a cell if it belongs to a sample that should be dropped
sc_cells_to_exclude  = purrr::map(list_names(sc), function(n) { 
  filter_result = purrr::map(list_names(param$cell_filter[[n]]), function(f) {
    filter = param$cell_filter[[n]][[f]]
    if (is.numeric(filter)) {
      if (is.na(filter[1])) filter[1] = -Inf # Minimum
      if (is.na(filter[2])) filter[2] = Inf  # Maximum 
      idx_exclude = sc[[n]][[f, drop=TRUE]] < filter[1] | sc[[n]][[f, drop=TRUE]] > filter[2]
      return(names(which(idx_exclude)))
    } else if (is.character(filter)) { 
      idx_exclude = !sc[[n]][[f, drop=TRUE]] %in% filter
      return(Cells(sc[[n]])[idx_exclude])
    }
  })

  # Samples to drop
  if (n %in% param$samples_to_drop) {
    filter_result[["samples_to_drop"]] = colnames(sc[[n]])
  } else {
    filter_result[["samples_to_drop"]] = as.character(c())
  }
  
  # Minimum number of cells for a sample to keep
  if (ncol(sc[[n]]) < param$samples_min_cells) {
    filter_result[["samples_min_cells"]] = colnames(sc[[n]])
  } else {
    filter_result[["samples_min_cells"]] = as.character(c())
  }
  
  return(filter_result)
})

# Summarise
sc_cells_to_exclude_summary = purrr::map_dfr(sc_cells_to_exclude, function(s) {
  return(as.data.frame(purrr::map(s, length))) 
  })
rownames(sc_cells_to_exclude_summary) = names(sc_cells_to_exclude)

sc_cells_to_exclude_summary$Original = purrr::map_int(sc, ncol)
sc_cells_to_exclude_summary$Excluded = purrr::map_int(sc_cells_to_exclude, function(s) { return(purrr::flatten(s) %>% unique() %>% length())})
sc_cells_to_exclude_summary$PercKept = round((sc_cells_to_exclude_summary$Original - sc_cells_to_exclude_summary$Excluded) / sc_cells_to_exclude_summary$Original * 100, 2)
knitr::kable(sc_cells_to_exclude_summary, 
             align="l", 
             caption="Summary of excluded cells") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
Summary of excluded cells
nFeature_RNA percent_mt samples_to_drop samples_min_cells Original Excluded PercKept
pbmc_10x 0 0 0 0 4033 0 100
pbmc_smartseq2_sample1 0 0 0 0 311 0 100
# Add filter column to sc_cell_metadata for post-filtering QC
sc_cell_metadata$IS_FILTERED = rownames(sc_cell_metadata) %in% unlist(sc_cells_to_exclude)

# Now filter, drop the respective colours and adjust integration method
sc = purrr::map(list_names(sc), function(n) {
  cells_to_keep = Cells(sc[[n]])
  cells_to_keep = cells_to_keep[!cells_to_keep %in% purrr::flatten_chr(sc_cells_to_exclude[[n]])]
  if (length(cells_to_keep)==0) return(NULL)
  else return(subset(sc[[n]], cells=cells_to_keep))
}) %>% purrr::discard(is.null)

if (length(sc)==1) param$integrate_samples[["method"]] = "single"
# Only RNA assay at the moment

# Iterate over samples and record a feature if it does not pass the filter
sc_features_to_exclude = purrr::map(list_names(sc), function(n) {
  
  # Make sure the sample contains more cells than the minimum threshold
  if (length(Cells(sc[[n]])) < param$feature_filter[[n]][["min_cells"]]) return(list())
  
  # Return gene names that do not pass the minimum threshold 
  else return(names(which(sc[[n]][[param$assay_raw]][["num_cells_expr_threshold", drop=TRUE]] < param$feature_filter[[n]][["min_cells"]])))
})

# Which genes are to be filtered for all samples?
# Note: Make sure that no other sample is called "AllSamples"
sc_features_to_exclude$AllSamples = Reduce(f=intersect, x=sc_features_to_exclude)

# Summarise
sc_features_to_exclude_summary = purrr::map_dfr(names(sc), function(n){
  df = data.frame(Original=nrow(sc[[n]]), 
                  FailThreshold=length(sc_features_to_exclude[[n]]))
  df$PercFailThreshold = round(df$FailThreshold / df$Original * 100, 2)
  df$Kept = length(setdiff(rownames(sc[[n]]), sc_features_to_exclude[["AllSamples"]]))
  df$PercKept = round(df$Kept / df$Original * 100, 2)
  return(df)
})
rownames(sc_features_to_exclude_summary) = names(sc)
knitr::kable(sc_features_to_exclude_summary, align="l", caption="Summary of excluded genes") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
Summary of excluded genes
Original FailThreshold PercFailThreshold Kept PercKept
pbmc_10x 33694 13893 41.23 21312 63.25
pbmc_smartseq2_sample1 33694 15754 46.76 21312 63.25
# Now filter those genes that are to be filtered for all samples 
sc = purrr::map(list_names(sc), function(n) {
  assay_names = Seurat::Assays(sc[[n]])
  features_to_keep = purrr::map(values_to_names(assay_names), function(a) {
    features = rownames(sc[[n]][[a]])
    keep = features[!features %in% sc_features_to_exclude$AllSamples]
    return(keep)
  })
  return(subset(sc[[n]], features=purrr::flatten_chr(features_to_keep)))
})

After filtering, the size of the Seurat object is:

sc
## $pbmc_10x
## An object of class Seurat 
## 21312 features across 4033 samples within 1 assay 
## Active assay: RNA (21312 features, 0 variable features)
## 
## $pbmc_smartseq2_sample1
## An object of class Seurat 
## 21312 features across 311 samples within 1 assay 
## Active assay: RNA (21312 features, 0 variable features)

Quality control post filtering

The updated QC plots are:

# Now create plot per QC metric after filtering
p_list = list()
for (m in cell_qc_features) {
  p_list[[m]] = ggplot(sc_cell_metadata[, c("orig.ident", m, "IS_FILTERED")] %>% dplyr::filter(!IS_FILTERED), aes(x=orig.ident, y=!!sym(m), fill=orig.ident, group=orig.ident)) +
    geom_violin(scale="width")

  # Adds points for samples with less than three cells since geom_violin does not work here
  p_list[[m]] = p_list[[m]] + 
    geom_point(data=sc_cell_metadata[, c("orig.ident", m, "IS_FILTERED")] %>% dplyr::filter(!IS_FILTERED, orig.ident %in% names(which(table(sc_cell_metadata$orig.ident) < 3))), aes(x=orig.ident, y=!!sym(m), fill=orig.ident), shape=21, size=2)
  
  # Now add style
  p_list[[m]] = p_list[[m]] + 
    AddStyle(title=m, legend_position="none", fill=param$col_samples, xlab="") + 
    theme(axis.text.x=element_text(angle=45, hjust=1))
  
  # Add filter threshold as segments to plot; min threshold lines are dashed and max threshold lines are twodashed
  if (nrow(cell_qc_thresholds[[m]]) > 0) {
    sample_names = sc_cell_metadata[, c("orig.ident", m, "IS_FILTERED")] %>% dplyr::filter(!IS_FILTERED) %>% dplyr::pull(orig.ident) %>% unique()
    p_list[[m]] = p_list[[m]] + geom_segment(data=cell_qc_thresholds[[m]] %>% dplyr::filter(orig.ident %in% sample_names), 
                                             aes(x=as.integer(as.factor(orig.ident))-0.5, 
                                                 xend=as.integer(as.factor(orig.ident))+0.5, 
                                                 y=value, yend=value, lty=threshold), colour="firebrick") +
      scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min")))
  }
}
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Distribution of feature values") 
p

# Correlate QC metrics for cells
p_list = list()
sc_cell_metadata_plot_order = sample(1:nrow(sc_cell_metadata))

# nFeature vs nCount
m = paste0(c("nCount_", "nFeature_"), param$assay_raw)
p_list[[1]] = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes(x=!!sym(m[1]), y=!!sym(m[2]), colour=orig.ident, alpha=IS_FILTERED)) +
  geom_point() + 
  scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
  scale_alpha_manual(values=setNames(c(1, 0.1), c(FALSE, TRUE))) +
  AddStyle(col=param$col_samples) +
  guides(alpha = "none")
if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
  p_list[[1]] = p_list[[1]] + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
}
if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
  p_list[[1]] = p_list[[1]] + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
}
  
# nFeature vs percent_mt
m = c("percent_mt", paste0(c("nFeature_"), param$assay_raw))
p_list[[2]] = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes(x=!!sym(m[1]), y=!!sym(m[2]), colour=orig.ident, alpha="IS_FILTERED")) +
  geom_point() +
  scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
  scale_alpha_manual(values=setNames(c(1, 0.1), c(FALSE, TRUE))) +
  AddStyle(col=param$col_samples) +
  guides(alpha = "none")
if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
  p_list[[2]] = p_list[[2]] + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
}
if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
  p_list[[2]] = p_list[[2]] + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
}

# nFeature vs percent_ercc
if ("percent_ercc" %in% names(cell_qc_features)) {
  m = c("percent_ercc", paste0(c("nFeature_"), param$assay_raw))
  p_list[[3]] = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes(x=!!sym(m[1]), y=!!sym(m[2]), colour="orig.ident", alpha="IS_FILTERED")) +
    geom_point() + 
    scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
    scale_alpha_manual(values=setNames(c(1, 0.1), c(FALSE, TRUE))) +
    AddStyle(col=param$col_samples) +
    guides(alpha = "none")
  if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
    p_list[[3]] = p_list[[3]] + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
  }
  if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
    p_list[[3]] = p_list[[3]] + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
  }
}

# Combine plots
p = patchwork::wrap_plots(p_list, ncol=length(p_list)) + patchwork::plot_annotation("Features plotted against each other")
if (length(p_list)==1) {
  p = p & theme(legend.position="bottom")
} else {
  p = p + patchwork::plot_layout(guides="collect") & theme(legend.position="bottom")
}
p

# downsample_cells_n overwrites downsample_cells_equally
if (!is.null(param$downsample_cells_n)) {
  n = param$downsample_cells_n
} else if (param$downsample_cells_equally) {
  n = purrr::map_int(sc, ncol) %>% min()  
}

# Actual downsampling
if (!is.null(param$downsample_cells_n) | param$downsample_cells_equally) {
  sc = purrr::map(sc, function(s) {
    cells = ScSampleCells(sc=s, n=n, seed=1)
    return(subset(s, cells=cells))
  })
  
  # Adjust combined metadata accordingly
  sc_cell_metadata = sc_cell_metadata[unlist(purrr::map(sc, Cells)), ]
  
  message("Your data has been downsampled.")
  print(sc)
}
# Remove filtered cells from metadata
sc_cell_metadata = sc_cell_metadata %>% dplyr::filter(IS_FILTERED==FALSE)

# Update levels but make sure level order stays the same
idx.factors = sapply(sc_cell_metadata, is.factor) %>% which()
for (n in colnames(sc_cell_metadata[idx.factors])) {
  levels_old = sc_cell_metadata %>% dplyr::pull(n) %>% levels()
  levels_new = sc_cell_metadata %>% dplyr::pull(n) %>% as.character() %>% unique()
  sc_cell_metadata[[n]] = factor(sc_cell_metadata[[n]], levels=levels_old[levels_old %in% levels_new])
}

# Update actual colors as well, as they will appear in the plots otherwise
param$col_samples = param$col_samples[names(param$col_samples) %in% names(sc)]

Normalisation

In this section, we subsequently run a series of Seurat functions for each provided sample:

1. We start by running a standard log normalisation, where counts for each cell are divided by the total counts for that cell and multiplied by 10,000. This is then natural-log transformed.
What do we need normalisation for?

The number of raw sequencing reads per cell is influenced by technical differences in the capture, reverse transcription and sequencing of RNA molecules, particularly due to the difficulty of achieving consistent library preparation with minimal starting material. Thus, comparing gene expression between cells may reveal differences that are solely due to sampling effects. After low-quality cells were removed in the previous step, the primary goal of normalization is to remove technical sampling effects while preserving the true biological signal.

Count depth scaling is the simplest and most commonly used normalization strategy. The underlying idea is that each cell initially contained an equal number of mRNA molecules, and differences arise due to sampling effects. For each cell, the number of reads per gene is divided by a cell-specific “size factor”, which is proportional to the total count depth of the cell. The resulting normalized data add up to 1 per cell, and is then typically multiplied by a factor of 10 (10,000 in this workflow).

Finally, normalized data are log-transformed for three important reasons. First, distances between log-transformed expression data represent log fold changes. Log-transformation emphasizes contributions from genes with strong relative differences, for example a gene that is expressed at an average count of 50 in cell type A and 10 in cell type B rather than a gene that is expressed at an average count of 1100 in A and 1000 in B. Second, log-transformation mitigates the relation between mean and variance in the data. Lastly, log-transformation reduces that skewness of the data as many downstream analyses require the data to be normally distributed.
2. We assign cell cycle scores to each cell based on its normalised expression of G2/M and S phase markers. These scores are visualised in a separate section further below. If specified in the above parameter section, cell cycle effects are removed during scaling (step 3). Cell cycle effects removed for this report: FALSE; all cell cycle effects removed for this report: FALSE.
How does removal of cell cycle effects affect the data? Note that removing all signal associated to cell cycle can negatively impact downstream analysis. For example, in differentiating processes, stem cells are quiescent and differentiated cells are proliferating (or vice versa), and removing all cell cycle effects can blur the distinction between these cells. An alternative approach is to remove the difference between G2M and S phase scores. This way, signals separating non-cycling and cycling cells will be maintained, while differences amongst proliferating cells will be removed. For a more detailed explanation, see the cell cycle vignette for Seurat https://satijalab.org/seurat/v3.1/cell_cycle_vignette.html.
3. Dependent on the normalisation of your choice, we either
a. Run standard functions to select variable genes, and scale normalised gene counts. For downstream analysis it is beneficial to focus on genes that exhibit high cell-to-cell variation, that is they are highly expressed in some cells and lowly in others. To be able to compare normalised gene counts between genes, gene counts are further scaled to have zero mean and unit variance (z-score).
What do we need scaling for? After normalization, gene expression data can be compared between cells. However, expression of individual genes still cannot be compared. This is because genes have different lengths and, depending on the experimental set up, longer genes can be represented by a higher number of reads. To account for this effect, normalized data are further scaled using a z-transformation, resulting in the average expression of 0 and the variance of 1 for each gene across all cells. Note that additional unwanted sources of variations can be regressed out during the scaling process, such as cell cycle effects or the percentage of mitochondrial reads.
b. Run SCTransform, a new and more sophisticated normalisation method that replaces the previous functions (normalisation, variable genes and scaling).
What is SCTransform special about? The standard log-transformation applied in step 1 assumes that count depth influences all genes equally. However, it has been shown that the use of a single size factor will introduce different effects on highly and lowly expressed genes (Hafemeister and Satija (2019)). SCTransform is a new statistical approach for the modelling, normalization and variance stabilization of single-cell RNA-seq data, and is an alternative to steps 1 and 3a described above. Note that SCTransform has been developed for UMI count data and can therefore safely be applied to 10x but not SmartSeq-2 data. As for the scaling in step 3a, additional unwanted sources of variations can be regressed out during SCTransform.

Normalisation method used for this report: RNA; with additional sources of variance regressed out: .

While raw data is typically used for statistical tests such as finding marker genes, normalised data is mainly used for visualising gene expression values. Scaled data include variable genes only, potentially without cell cycle effects, and are mainly used to determine the structure of the dataset(s) with Principal Component Analysis, and indirectly to cluster and visualise cells in 2D space.

# Normalise data the original way
#   This is required to score cell cycle (https://github.com/satijalab/seurat/issues/1679)
sc = purrr::map(sc, Seurat::NormalizeData, normalization.method="LogNormalize", scale.factor=10000, verbose=FALSE)
# Determine cell cycle effect per sample 
sc = purrr::map(list_names(sc), function(n) {
  sc[[n]] = CCScoring(sc=sc[[n]], genes_s=genes_s[,2], genes_g2m=genes_g2m[,2], name=n)
  if (any(is.na(sc[[n]][["S.Score"]])) | any(is.na(sc[[n]][["G2M.Score"]]))) {
    param$cc_remove=FALSE
    param$cc_remove_all=FALSE
    param$cc_rescore_after_merge=FALSE
  }
  return(sc[[n]])
})

# If cell cycle effects should be removed, we first score cells 
# The effect is then removed in the following chunk 
if (param$cc_remove) {
# Add to vars that need to regressed out during normalisation
  if (param$cc_remove_all) {
    # Remove all signal associated to cell cycle
    param$vars_to_regress = unique(c(param$vars_to_regress, "S.Score", "G2M.Score"))
    param$latent_vars = unique(c(param$latent_vars, "S.Score", "G2M.Score"))
  } else {
    # Don't remove the difference between cycling and non-cycling cells 
    param$vars_to_regress = unique(c(param$vars_to_regress, "CC.Difference"))
    param$latent_vars = unique(c(param$latent_vars, "CC.Difference"))
  }  
}
if (param$norm == "RNA") { 
  # Find variable features from normalised data (unaffected by scaling)
  sc = purrr::map(sc, Seurat::FindVariableFeatures, selection.method="vst", nfeatures=3000, verbose=FALSE)
  
  # Scale 
  # Note: For a single dataset where no integration/merging is needed, all features can already be scaled here. 
  #   Otherwise, scaling of all features will be done after integration/merging.
  if (param$integrate_samples[["method"]]=="single") {
    sc[[1]] = Seurat::ScaleData(sc[[1]], 
                      features=rownames(sc[[1]][["RNA"]]),
                      vars.to.regress=param$vars_to_regress, 
                      verbose=FALSE) 
  }
} else if (param$norm == "SCT") {
  # Run SCTransform
  #
  # This is a new normalisation method that replaces previous Seurat functions "NormalizeData", "FindVariableFeatures", and "ScaleData". 
  # vignette: https://satijalab.org/seurat/v3.0/sctransform_vignette.html
  # paper: https://www.biorxiv.org/content/10.1101/576827v2
  # Normalised data end up here: sc@assays$SCT@data
  # Note: For a single dataset where no integration is needed, all features can already be scaled here. 
  #   Otherwise, it is enough to scale only the variable features.
  # Note: It is not guaranteed that all genes are successfully normalised with SCTransform. 
  #   Consequently, some genes might be missing from the SCT assay. 
  #   See: https://github.com/ChristophH/sctransform/issues/27
  # Note: The performance of SCTransform can be improved by using "glmGamPoi" instead of "poisson" as method for initial parameter estimation.
  sc = purrr::map(list_names(sc), function(n) { 
    SCTransform(sc[[n]], 
                assay=param$assay_raw,
                vars.to.regress=param$vars_to_regress, 
                min_cells=param$feature_filter[[n]][["min_cells"]], 
                verbose=FALSE, 
                return.only.var.genes=!(param$integrate_samples[["method"]]=="single"),
                method=ifelse(packages_installed("glmGamPoi"), "glmGamPoi", "poisson")) 
  })
}

Experience shows that 1,000-2,000 genes with the highest cell-to-cell variation are often sufficient to describe the global structure of a single-cell dataset. For example, cell type-specific genes typically highly vary between cells. Housekeeping genes, on the other hand, are similarly expressed across cells and can be disregarded to differentiate between cells. Highly variable genes are typically the genes with a cell type specific expression profile, and are often the genes of interest in single-cell experiments. Housekeeping genes, with similar levels of expression across all cells, or genes with minor expression differences, might add random noise and mask relevant changes during downstream dimensionality reduction and clustering. We therefore aim to select a sensible set of variable genes that includes interesting biological signal and excludes noise. Here, the top 3,000 variable genes are selected and used for downstream analysis.

How are variable genes selected? To determine variable genes, we need to separate biological variability from technical variability. Technical variability arises especially for lowly expressed genes, where high variability corresponds to small absolute changes that we are not interested in. Here, we use the variance-stabilizing transformation (vst) method implemented in Seurat (Hafemeister and Satija (2019)). This method first models the technical variability as a relationship between mean gene expression and variance using local polynomial regression. The model is then used to calculate the expected variance based on the observed mean gene expression. The difference between the observed and expected variance is called residual variance and likely reflects biological variability.
fig_height_vf = 5 * ceiling(length(names(sc))/2)
p_list = purrr::map(list_names(sc), function(n) {
  top10 = head(Seurat::VariableFeatures(sc[[n]], assay=ifelse(param$norm=="SCT", param$norm, param$assay_raw)), 10)
  p = Seurat::VariableFeaturePlot(sc[[n]], 
                                  assay=ifelse(param$norm=="SCT", param$norm, param$assay_raw), 
                                  selection.method=ifelse(param$norm=="RNA", "vst", "sct"), 
                                  col=c("grey", param$col)) + 
    AddStyle(title=n) + 
    theme(legend.position=c(0.2, 0.8), legend.background=element_rect(fill=alpha("white", 0.0)))
  p = LabelPoints(plot=p, points=top10, repel=TRUE, xnudge=0, ynudge=0)
  return(p)
})

p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Variable genes")
p

Combining multiple samples

if (param$integrate_samples[["method"]]!="single") {
  
  # When merging, feature meta-data is removed by Seurat entirely; save separately for each assay except for SCT and add again afterwards
  # Note: not sure whether this is still needed - discuss
  assay_names = setdiff(unique(purrr::flatten_chr(purrr::map(list_names(sc), function(n) { Seurat::Assays(sc[[n]]) } ))), "SCT")

  # Loop through all assays and accumulate meta data
  sc_feature_metadata = purrr::map(values_to_names(assay_names), function(a) {
    # "feature_id", "feature_name", "feature_type" are accumulated for all assays and stored just once
    # This step is skipped for assays that do not contain all three types of feature information
    contains_neccessary_columns = purrr::map_lgl(list_names(sc), function(n) { 
      all(c("feature_id", "feature_name", "feature_type") %in% colnames(sc[[n]][[a]][[]])) 
      })

    if (all(contains_neccessary_columns)) {
      feature_id_name_type = purrr::map(sc, function(s) return(s[[a]][[c("feature_id", "feature_name", "feature_type")]]) )
      feature_id_name_type = purrr::reduce(feature_id_name_type, function(df_x, df_y) {
        new_rows = which(!rownames(df_y) %in% rownames(df_x))
        if (length(new_rows) > 0) return(rbind(df_x, df_y[new_rows, ]))
        else return(df_x)
      })
      feature_id_name_type$row_names = rownames(feature_id_name_type)
    } else {
      feature_id_name_type = NULL
    }
    
    # For all other meta-data, we prefix column names with the dataset
    other_feature_data = purrr::map(list_names(sc), function(n) {
      df = sc[[n]][[a]][[]]
      if (contains_neccessary_columns[[n]]) df = df %>% dplyr::select(-dplyr::one_of(c("feature_id", "feature_name", "feature_type"), c()))
      if (ncol(df) > 0) colnames(df) = paste(n, colnames(df), sep=".")
      df$row_names = rownames(df)
      return(df)
    })
    
    # Now join everything by row_names by full outer join
    if (!is.null(feature_id_name_type)) {
      feature_data = purrr::reduce(c(list(feature_id_name_type=feature_id_name_type), other_feature_data), dplyr::full_join, by="row_names")
    } else {
      feature_data = purrr::reduce(other_feature_data, dplyr::full_join, by="row_names")
    }
    rownames(feature_data) = feature_data$row_names
    feature_data$row_names = NULL
    
    return(feature_data)
  })
  
  # When merging, cell meta-data are merged but factors are not kept
  sc_cell_metadata = suppressWarnings(purrr::map_dfr(sc, function(s){ s[[]] }) %>% as.data.frame())
  sc_cell_metadata_factor_levels = purrr::map(which(sapply(sc_cell_metadata, is.factor)), function(n) {
    return(levels(sc_cell_metadata[, n, drop=TRUE]))
  })
}
# Standard method for integrating multiple samples. Best performance but computationally intensive.
if (param$integrate_samples[["method"]]=="integrate") {
  if (!"use_reciprocal_pca" %in% names(param$integrate_samples)) param$integrate_samples[["use_reciprocal_pca"]] = FALSE

  # Run integration
  sc = RunIntegration(sc, 
                      assay=param$norm,
                      ndims=param$integrate_samples[["dimensions"]], 
                      verbose=FALSE, 
                      reference=param$integrate_samples[["reference"]], 
                      use_reciprocal_pca=param$integrate_samples[["use_reciprocal_pca"]], 
                      k_filter=param$integrate_samples[["k.filter"]], 
                      k_weight=param$integrate_samples[["k.weight"]], 
                      k_anchor=param$integrate_samples[["k.anchor"]],
                      k_score=param$integrate_samples[["k.score"]])

  # Re-score cell cycle effects after integration
  if (param$cc_rescore_after_merge) {
    sc = CCScoring(sc=sc, genes_s=genes_s[,2], genes_g2m=genes_g2m[,2])
    if (any(is.na(sc[["S.Score"]])) | any(is.na(sc[["G2M.Score"]])))  {
      param$cc_remove=FALSE
      param$cc_remove_all=FALSE
      param$cc_rescore_after_merge=FALSE
      param$vars_to_regress = setdiff(param$vars_to_regress, c("S.Score", "G2M.Score", "CC.Difference"))
      param$latent_vars = setdiff(param$latent_vars, c("S.Score", "G2M.Score", "CC.Difference"))
    }
  }
  
  # (Re-)Run scaling
  if (param$norm == "RNA") {
    # According to Seurat, we need to scale data again for "RNAintegrated", and "RNA"
    DefaultAssay(sc) = "RNAintegrated"
    sc = Seurat::ScaleData(sc, 
                                features=rownames(sc[["RNAintegrated"]]), 
                                vars.to.regress=param$vars_to_regress, 
                                assay="RNAintegrated",
                                verbose=FALSE)
      
    DefaultAssay(sc) = "RNA"
    sc = Seurat::ScaleData(sc, 
                                features=rownames(sc[["RNA"]]), 
                                vars.to.regress=param$vars_to_regress, 
                                assay="RNA",
                                verbose=FALSE)
  } else if (param$norm == "SCT") {
    # We need to re-run SCTransform for the "SCT" assay again, to normalise on the complete dataset
    DefaultAssay(sc) = "SCT"
    min_cells_overall = max(purrr::map_int(param$feature_filter, function(f) as.integer(f[["min_cells"]])))
    sc = SCTransform(sc, 
                     assay=param$assay_raw, 
                     vars.to.regress=param$vars_to_regress, 
                     min_cells=min_cells_overall,
                     verbose=FALSE,
                     return.only.var.genes=FALSE,
                     method=ifelse(packages_installed("glmGamPoi"), "glmGamPoi", "poisson"))
  }
  
  # Add feature metadata
  for (a in Seurat::Assays(sc)) {
    if (a %in% names(sc_feature_metadata)) {
      sc[[a]] = Seurat::AddMetaData(sc[[a]], sc_feature_metadata[[a]][rownames(sc[[a]]),, drop=FALSE])
    }
  }
  
  # Fix cell metadata factors
  for (f in names(sc_cell_metadata_factor_levels)) {
    sc[[f]] = factor(sc[[f, drop=TRUE]], levels=sc_cell_metadata_factor_levels[[f]])
  }
  
  # Add sample colours again to misc slot
  sc = ScAddLists(sc, lists=list(orig.ident=param$col_samples), lists_slot="colour_lists")

  # Set default assay (will be the integrated version)
  DefaultAssay(sc) = paste0(param$norm, "integrated")  
  
  message("Data values for all samples have been integrated.")
  print(sc)
  
  # Add sample as latent_vars for marker detection
  param$latent_vars = c(param$latent_vars, "orig.ident")
}

× (Message)
Data values for all samples have been integrated.

## An object of class Seurat 
## 24312 features across 4344 samples within 2 assays 
## Active assay: RNAintegrated (3000 features, 3000 variable features)
##  1 other assay present: RNA

Relative log expression

n_cells_rle_plot = 100

# Sample at most 100 cells per dataset and save their identity
cells_subset = sc[["orig.ident"]] %>% tibble::rownames_to_column() %>% 
  dplyr::group_by(orig.ident) %>% 
  dplyr::sample_n(size=min(n_cells_rle_plot, length(orig.ident))) %>% 
  dplyr::select(rowname, orig.ident)

To better understand the efficiency of the applied normalisation procedures, we plot the relative log expression of genes in at most 100 randomly selected cells per sample before and after normalisation. This type of plot reveals unwanted variation in your data. The concept is taken from Gandolfo and Speed (2018). In brief, we remove variation between genes, leaving only variation between samples. If expression levels of most genes are similar in all cell types, sample heterogeneity is a sign of unwanted variation.

For each gene, we calculate its median expression across all cells, and then calculate the deviation from this median for each cell. For each cell, we plot the median expression (black), the interquartile range (lightgrey), whiskers defined as 1.5 times the interquartile range (darkgrey), and outliers (#374E55FF, #DF8F44FF)

Raw counts

# Plot raw data
p = PlotRLE(as.matrix(log2(GetAssayData(subset(sc, cells=cells_subset$rowname), assay=param$assay_raw, slot="counts") + 1)), 
            id=cells_subset$orig.ident, 
            col=param$col_samples) + 
  labs(title="log2(raw counts + 1)")
p

Normalised data

Dependent on the context, this tab refers to different data:

  • Single sample: RNA normalisation of the single sample
  • Multiple samples that were merged: Combined RNA normalisation post merging of all samples
  • Multiple samples that were integrated: Separate RNA normalisation prior to integration of all samples
# Plot normalised data
p = PlotRLE(as.matrix(GetAssayData(subset(sc, cells=cells_subset$rowname), assay=ifelse(param$norm=="SCT", param$norm, param$assay_raw), slot="data")), 
            id=cells_subset$orig.ident, 
            col=param$col_samples) + 
  labs(title="Normalised data")
p

Integrated data

if (! param$integrate_samples[["method"]] %in% c("single", "merge")) {
  # Plot integrated data
  p = PlotRLE(as.matrix(GetAssayData(subset(sc, cells=cells_subset$rowname), assay=paste0(param$norm, "integrated"), slot="data")), 
              id=cells_subset$orig.ident, 
              col=param$col_samples) + 
    labs(title="Integrated data")
  p
} else {
  message("No integrated data available.")
}

Dimensionality reduction

A single-cell dataset of 20,000 genes and 5,000 cells has 20,000 dimensions. At this point of the analysis, we have already reduced the dimensionality of the dataset to 3,000 variable genes. The biological manifold however can be described by far fewer dimensions than the number of (variable) genes, since expression profiles of different genes are correlated if they are involved in the same biological process. Dimension reduction methods aim to find these dimensions. There are two general purposes for dimension reduction methods: to summarize a dataset, and to visualize a dataset.

We use Principal Component Analysis (PCA) to summarize a dataset, overcoming noise and reducing the data to its essential components. Later, we use Uniform Manifold Approximation and Projection (UMAP) to visualize the dataset, placing similar cells together in 2D space, see below.

PCA in a nutshell

Principal Component Analysis is a way to summarize a dataset and to reduce noise by averaging similar gene expression profiles. The information for correlated genes is compressed into single dimensions called principal components (PCs) and the analysis identifies those dimensions that capture the largest amount of variation. This process gives a more precise representation of the patterns inherent to complex and large datasets.

In a PCA, the first PC captures the greatest variance across the whole dataset. The next PC captures the greatest remaining amount of variance, and so on. This way, the top PCs are likely to represent the biological signal where multiple genes are affected by the same biological processes in a coordinated way. In contrast, random technical or biological noise that affects each gene independently are contained in later PCs. Downstream analyses can be restricted to the top PCs to focus on the most relevant biological signal and to reduce noise and unnecessary computational overhead.

To decide how many PCs to include in downstream analyses, we visualise the cells and genes that define the PCA.

# Run PCA for default normalisation
sc = Seurat::RunPCA(sc, features=Seurat::VariableFeatures(object=sc), verbose=FALSE, npcs=min(50, ncol(sc)))
p_list = Seurat::VizDimLoadings(sc, dims=1:2, reduction="pca", col=param$col, combine=FALSE, balanced=TRUE)
for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle()
p =  patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Top gene loadings of the first two PCs") 
p

p = Seurat::DimPlot(sc, reduction="pca", cols=param$col_samples) + 
  AddStyle(title="Cells arranged by the first two PCs", legend_position="bottom")
p

p_list = Seurat::DimHeatmap(sc, dims=1:min(20, ncol(sc)), cells=min(500, ncol(sc)), balanced=TRUE, fast=FALSE, combine=FALSE)
p_list = purrr::map(seq(p_list), function(i) {
  p_list[[i]] = p_list[[i]] + 
    ggtitle(paste("PC", i)) + 
    theme(legend.position="none", axis.text.y=element_text(size=8))
  return(p_list[[i]])
  })
p = patchwork::wrap_plots(p_list, ncol=3) + patchwork::plot_annotation("Top gene loadings of the first PCs")
p

Dimensionality of the dataset

We next need to decide how many PCs we want to use for our analyses. PCs include biological signal as well as noise, and we need to determine the number of PCs with which we include as much biological signal as possible and as little noise as possible. The following “Elbow plot” is designed to help us make an informed decision. It shows PCs ranked based on the percentage of variance they explain.

How do we determine the number of PCs for downstream analysis? The top PC captures the greatest variance across cells and each subsequent PC represents decreasing levels of variance. By visual inspection of the Elbow plot, we try to find the point at which we can explain most of the variance across cells. Commonly, the top 10 PCs are chosen. It may be helpful to repeat downstream analyses with a different number of PCs, although the results often do not differ dramatically. Note that it is recommended to rather choose too many than too few PCs.
# More approximate technique used to reduce computation time
p = Seurat::ElbowPlot(sc, ndims=min(50, ncol(sc))) + 
  geom_vline(xintercept=param$pc_n + .5, col="firebrick", lty=2) + 
  AddStyle(title="Elbow plot") 
p

# Cannot have more PCs than number of cells
param$pc_n = min(param$pc_n, ncol(sc))

For the current dataset, 10 PCs were chosen.

Clustering

Seurat’s clustering method first constructs a graph structure, where nodes are cells and edges are drawn between cells with similar gene expression patterns. Technically speaking, Seurat first constructs a K-nearest neighbor (KNN) graph based on Euclidean distance in PCA space, and refines edge weights between cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To partition the graph into highly interconnected parts, cells are iteratively grouped together using the Leiden algorithm (Traag, Waltman, and van Eck 2019).

Further explanation on clustering

At this point, we would like to define subpopulations of cells with similar gene expression profiles using unsupervised clustering. Clusters ultimately serve as approximations for biological objects like cell types or cell states.

During the first step of clustering, a K-nearest neighbor (KNN) graph is constructed. In simplified terms this means that cells are connected to their K nearest neighbors based on cell-to-cell expression similarity using the PCs chosen in the previous step. The higher the similarity is, the higher the edge weight becomes. During the second step, the graph is partitioned into highly interconnected communities, whereby each community represents a cluster of cells with similar expression profiles. The separation of the graph into clusters is dependent on the chosen resolution. For scRNA-seq datasets of around 3000 cells, it is recommended to use a resolution value between 0.4 and 1.2. This value can be set even higher for larger datasets. Note that the choice of PCs and cluster resolution is an arbitrary one. Therefore, it is highly recommended to evaluate clusters and re-run the workflow with adapted parameters if needed.

To get a first idea about how different cluster resolution values influence the clustering, we run and visualize the clustering multiple times, see below. For this report, you chose the resolution value 0.5 as the final value for further analyses.

cluster_resolutions = sort(unique(c(param$cluster_resolution, param$cluster_resolution_test)))

# Construct phylogenetic tree relating the "average" cell from each sample
if (length(levels(sc$orig.ident)) > 1) {
  sc = suppressWarnings(Seurat::BuildClusterTree(sc, features=rownames(sc), verbose=FALSE))
  Seurat::Misc(sc, "trees") = list(orig.ident = Seurat::Tool(sc, "Seurat::BuildClusterTree"))
}

# The number of clusters can be optimized by tuning "resolution" -> based on feedback from the client whether or not clusters make sense
# Choose the number of PCs to use for clustering
sc = Seurat::FindNeighbors(sc, dims=1:param$pc_n, verbose=FALSE, k.param=param$cluster_k)

# Seurat vignette suggests resolution parameter between 0.4-1.2 for datasets of about 3k cells
# But we can run multiple resolutions if requested
sc = suppressWarnings(Seurat::FindClusters(sc, resolution=cluster_resolutions, algorithm=4, verbose=FALSE, method="igraph"))

# Construct phylogenetic tree relating the "average" cell from each cluster for each clustering
# Also add colour lists for each clustering
for(r in cluster_resolutions) {
  n = paste0(DefaultAssay(sc), "_snn_res.", r)
  
  # Tree
  if (length(levels(sc[[n, drop=TRUE]])) > 1) {
    Seurat::Idents(sc) = n
    sc = suppressWarnings(Seurat::BuildClusterTree(sc, dims=1:param$pc_n, verbose=FALSE))
    l = list(Seurat::Tool(sc, "Seurat::BuildClusterTree"))
    names(l) = n
    suppressWarnings({Seurat::Misc(sc, "trees") = c(Seurat::Misc(sc, "trees"), l)})
  }
  
  col = GenerateColours(num_colours=length(levels(sc[[n, drop=TRUE]])), names=levels(sc[[n, drop=TRUE]]), 
                                     palette=param$col_palette_clusters, alphas=1)
  # Colours
  l = list(col)
  names(l) = n
  sc = ScAddLists(sc, lists=l, lists_slot="colour_lists")
}

# Set default clustering
n = paste0(DefaultAssay(sc), "_snn_res.", param$cluster_resolution)
sc$seurat_clusters = sc[[n, drop=TRUE]]
Seurat::Idents(sc) = sc$seurat_clusters
if (length(levels(sc$seurat_clusters)) > 1) {
  suppressWarnings({Seurat::Misc(sc, "trees") = c(Seurat::Misc(sc, "trees"), list(seurat_clusters = Seurat::Misc(sc, "trees")[[n]]))})
}

# Set up colors for default clustering
sc = ScAddLists(sc, lists=list(seurat_clusters=Misc(sc, "colour_lists")[[n]]), lists_slot="colour_lists")
param$col_clusters = Misc(sc, "colour_lists")[["seurat_clusters"]]

# Define height of test clusters
height_per_row = 3
nr_cols = 3
nr_rows = ceiling(length(cluster_resolutions)/nr_cols)
fig_height_test_clusters = nr_rows * height_per_row
# Default UMAP
sc = suppressWarnings(Seurat::RunUMAP(sc, dims=1:param$pc_n, verbose=FALSE, umap.method="uwot", n.neighbors=param$umap_k))

# If there are any test resolutions other than the default, go ahead
if (length(cluster_resolutions) > 1) {
  p_list = list()
  for(r in cluster_resolutions) {
    r = as.character(r)
    n = paste0(DefaultAssay(sc), "_snn_res.", r)
    
    cluster_cells = table(sc[[n, drop=TRUE]])
    cluster_labels = paste0(names(cluster_cells)," (", cluster_cells,")")
    
    p_list[[r]] = Seurat::DimPlot(sc, reduction="umap", group.by=n, pt.size=param$pt_size, label=TRUE) + 
      scale_color_manual(values=Seurat::Misc(sc, "colour_lists")[[n]], labels=cluster_labels) +
      AddStyle(title=r, legend_position="none") +
      FontSize(x.text = 10, y.text = 10, x.title = 13, y.title = 13, main = 15)
  }
  
  p = patchwork::wrap_plots(p_list, ncol=nr_cols) + patchwork::plot_annotation(title="UMAP, cells coloured by cluster identity for different resolution values")
  print(p)
}

Visualisation with UMAP

We use a UMAP to visualise and explore a dataset. The goal is to place similar cells together in 2D space, and learn about the biology underlying the data. Cells are color-coded according to the graph-based clustering, and clusters typcially co-localise on the UMAP.

Take care not to mis-read a UMAP:

  • Parameters influence the plot (we use defaults here)
  • Cluster sizes relative to each other mean nothing, since the method has a local notion of distance
  • Distances between clusters might not mean anything
  • You may need more than one plot

For a nice read to intuitively understand UMAP, see https://pair-code.github.io/understanding-umap/.

Coloured by cluster

# Note that you can set `label = TRUE` or use the LabelClusters function to help label individual clusters
cluster_cells = table(sc@active.ident)
cluster_labels = paste0(levels(sc@active.ident)," (", cluster_cells[levels(sc@active.ident)],")")
p = Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters") + 
  scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
  AddStyle(title="UMAP, cells coloured by cluster identity", legend_position="bottom", legend_title="Clusters")
p = LabelClusters(p, id="seurat_clusters", box=TRUE, fill="white")
p

Coloured by cluster (per sample)

# Plot all samples separately
# Repel will be deactivated if there are more than six samples; otherwise the plot may crash
p = Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters", pt.size=param$pt_size, split.by = "orig.ident", ncol = 2) +
  scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
  AddStyle(title="UMAP, cells coloured by cluster identity", legend_position="bottom", legend_title="Clusters")
p = LabelClusters(p, id="seurat_clusters", box=TRUE, fill="white", repel=ifelse(length(unique(sc$orig.ident)) <= 5, TRUE, FALSE))
p

Coloured by sample

sample_cells = table(sc$orig.ident)
sample_labels = paste0(levels(sc$orig.ident)," (", sample_cells[levels(sc$orig.ident)],")")
# Note: This is a hack to colour by sample but label by Cluster
p = Seurat::DimPlot(sc, reduction="umap", group.by="orig.ident") +
  scale_color_manual(values=param$col_samples, labels=sample_labels) +
  AddStyle(title="UMAP, cells coloured by sample of origin", legend_position="bottom")
p$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data), ]
p = LabelClusters(p, id="seurat_clusters", box=TRUE, segment.color="black", fill="white")
p

Coloured by sample (per sample)

# Plot all samples separately
# Repel will be deactivated if there are more than six samples; otherwise the plot may crash
p = Seurat::DimPlot(sc, reduction="umap", group.by="orig.ident", pt.size=param$pt_size, split.by = "orig.ident", ncol = 2) +
  scale_color_manual(values=param$col_samples, labels=sample_labels) +
  AddStyle(title="UMAP, cells coloured by sample of origin", legend_position="bottom")
p$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data), ]
p = LabelClusters(p, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel=ifelse(length(unique(sc$orig.ident)) <= 5, TRUE, FALSE))
p

Distribution of samples in clusters

# Count cells per cluster per sample 
cell_samples = sc[[]] %>% dplyr::pull(orig.ident) %>% levels()
cell_clusters = sc[[]] %>% dplyr::pull(seurat_clusters) %>% levels()

tbl = dplyr::count(sc[[c("orig.ident", "seurat_clusters")]], orig.ident, seurat_clusters) %>% tidyr::pivot_wider(names_from="seurat_clusters", names_prefix="Cl_", values_from=n, values_fill=0) %>% as.data.frame()
rownames(tbl) = paste0(tbl[,"orig.ident"],"_n")
tbl[,"orig.ident"] = NULL

# Add percentages
tbl_perc = round(t(tbl) / colSums(tbl) * 100, 2) %>% t()
rownames(tbl_perc) = gsub(rownames(tbl_perc), pattern="_n$", replacement="_perc", perl=TRUE)
tbl = rbind(tbl, tbl_perc)

# Add enrichment
if (length(cell_samples) > 1 & length(cell_clusters) > 1) tbl = rbind(tbl, CellsFisher(sc))

# Sort
tbl = tbl[order(rownames(tbl)),, drop=FALSE]

# Plot percentages
tbl_bar = tbl[paste0(cell_samples, "_perc"), , drop=FALSE] %>% 
  tibble::rownames_to_column(var="Sample") %>%
  tidyr::pivot_longer(tidyr::starts_with("Cl"), names_to="Cluster", values_to="Percentage")
tbl_bar$Cluster = tbl_bar$Cluster %>% gsub(pattern="^Cl_", replacement="", perl=TRUE) %>% factor(levels=sc$seurat_clusters %>% levels())
tbl_bar$Sample = tbl_bar$Sample %>% gsub(pattern="_perc$", replacement="", perl=TRUE) %>% as.factor()
tbl_bar$Percentage = as.numeric(tbl_bar$Percentage)
p = ggplot(tbl_bar, aes(x=Cluster, y=Percentage, fill=Sample)) + 
  geom_bar(stat="identity" ) +
  AddStyle(title="Percentage cells of samples in clusters",
           fill=param$col_samples,
           legend_title="Sample",
           legend_position="bottom")
p

The following table shows the number of cells per sample and cluster:

  • n: Number of cells per sample and cluster
  • perc: Percentage of cells per sample and cluster compared to all other cells of that cluster

In case the dataset contains 2 or more samples, we also calculate whether or not the number of cells of a sample in a cluster is significantly higher or lower than expected:

  • oddsRatio: Odds ratio calculated for cluster c1 and sample s1 as (# cells s1 in c1 / # cells not s1 in c1) / (# cells s1 not in c1 / # cells not s1 not in c1)
  • p: P-value calculated with a Fisher test to test whether “n” is higher or lower than expected
# Print table
knitr::kable(tbl, align="l", caption="Number of cells per sample and cluster") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>% 
  kableExtra::scroll_box(width="100%", fixed_thead=TRUE)
Number of cells per sample and cluster
Cl_1 Cl_2 Cl_3 Cl_4 Cl_5 Cl_6 Cl_7 Cl_8 Cl_9 Cl_10
pbmc_10x_n 904 594 571 448 416 328 338 268 126 40
pbmc_10x_perc 91.78 92.67 94.85 91.24 94.55 90.61 95.21 93.06 94.03 86.96
pbmc_10x.oddsRatio 0.82 0.97 1.49 0.78 1.38 0.72 1.58 1.04 1.22 0.51
pbmc_10x.p 9.4e-01 6.1e-01 2.1e-02 9.4e-01 8.3e-02 9.6e-01 3.9e-02 5.0e-01 3.7e-01 9.6e-01
pbmc_smartseq2_sample1_n 81 47 31 43 24 34 17 20 8 6
pbmc_smartseq2_sample1_perc 8.22 7.33 5.15 8.76 5.45 9.39 4.79 6.94 5.97 13.04
pbmc_smartseq2_sample1.oddsRatio 1.22 1.03 0.67 1.28 0.73 1.39 0.63 0.97 0.82 1.96
pbmc_smartseq2_sample1.p 8.2e-02 4.5e-01 9.9e-01 8.8e-02 9.4e-01 5.7e-02 9.8e-01 5.9e-01 7.5e-01 1.1e-01
# Reset default assay, so we won't plot integrated data
# Note: We need integrated data for UMAP, clusters
DefaultAssay(sc) = ifelse(param$norm=="SCT", param$norm, param$assay_raw)

Cell Cycle Effect

How much do gene expression profiles in the dataset reflect the cell cycle phases the single cells were in? After initial normalisation, we determined the effects of cell cycle heterogeneity by calculating a score for each cell based on its expression of G2M and S phase markers. Scoring is based on the strategy described in Tirosh et al. (2016), and human gene symbols are translated to gene symbols of the species of interest using biomaRt. This section of the report visualises the above calculated cell cycle scores.

# Set up colours for cell cycle effect and add to sc object
col =  GenerateColours(num_colours=length(levels(sc$Phase)), names=levels(sc$Phase), palette="ggsci::pal_npg", alphas=1)
sc = ScAddLists(sc, lists=list(Phase=col), lists_slot="colour_lists")

# Get a feeling for how many cells are affected
p1 = ggplot(sc[[]], aes(x=S.Score, y=G2M.Score, colour=Phase)) + 
  geom_point() + 
  scale_x_continuous("S score") + 
  scale_y_continuous("G2/M score") + 
  AddStyle(col=Misc(sc, "colour_lists")[["Phase"]])

p2 = ggplot(sc@meta.data %>% 
              dplyr::group_by(seurat_clusters, Phase) %>% 
              dplyr::summarise(num_cells=length(Phase)), 
            aes(x=seurat_clusters, y=num_cells, fill=Phase)) + 
  geom_bar(stat="identity", position="fill") + 
  scale_x_discrete("Seurat clusters") + 
  scale_y_continuous("Fraction of cells") + 
  AddStyle(fill=Misc(sc, "colour_lists")[["Phase"]]) + 
  theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1)) 

p3 = ggplot(sc[[]] %>% 
              dplyr::group_by(orig.ident, Phase) %>% 
              dplyr::summarise(num_cells=length(Phase)), 
            aes(x=orig.ident, y=num_cells, fill=Phase)) + 
  geom_bar(stat="identity", position="fill") + 
  scale_y_continuous("Fraction of cells") +
  AddStyle(fill=Misc(sc, "colour_lists")[["Phase"]]) + 
  theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1)) + xlab("")

p = p1 + p2 + p3 & theme(legend.position="bottom")
p = p + patchwork::plot_annotation(title="Cell cycle phases") + plot_layout(guides="collect")
p

UMAP coloured by cell cycle phases

if (any(!is.na(sc$Phase))) {
  # UMAP with phases superimposed
  # Note: This is a hack to colour by phase but label by Cluster
  p = Seurat::DimPlot(sc, group.by="Phase", pt.size=1, cols=Misc(sc, "colour_lists")[["Phase"]]) + 
    AddStyle(title="UMAP, cells coloured by cell cycle phases", legend_title="Phase")
  p$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data), ]
  p = LabelClusters(p, id="seurat_clusters", box=TRUE, fill="white")
  p
}

UMAP coloured by S phase

if (any(!is.na(sc$Phase))) {
  p = Seurat::FeaturePlot(sc, features="S.Score", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col)) +
    AddStyle(title="UMAP, cells coloured by S phase")
  p = LabelClusters(p, id="ident", box=TRUE, fill="white")
  p
}

UMAP coloured by G2/M phase

if (any(!is.na(sc$Phase))) {
  p = Seurat::FeaturePlot(sc, features="G2M.Score", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col)) + 
    AddStyle(title="UMAP, cells coloured by G2M phase")
  p = LabelClusters(p, id="ident", box=TRUE, fill="white")
  p
}

UMAP coloured by the difference between S and G2/M phase

if (any(!is.na(sc$Phase))) {
  p = Seurat::FeaturePlot(sc, features="CC.Difference", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col)) +
    AddStyle(title="UMAP, cells coloured by CC.Difference")
  p = LabelClusters(p, id="ident", box=TRUE, fill="white")
  p
}

Cluster QC

Do cells in individual clusters have particularly high counts, detected genes or mitochondrial content?

Number of counts

qc_feature = paste0("nCount_", param$assay_raw)
p1 = suppressMessages(Seurat::FeaturePlot(sc, features=qc_feature) + 
  AddStyle(title="Feature plot") + 
  scale_colour_gradient(low="lightgrey", high=param$col, trans="log10"))
p1 = LabelClusters(p1, id="ident", box=FALSE)


p2 = ggplot(sc[[]], aes(x=seurat_clusters, y=!!sym(qc_feature), fill=seurat_clusters, group=seurat_clusters)) + 
  geom_violin(scale="width") + 
  AddStyle(title="Violin plot (log10 scale)", fill=param$col_clusters,
           xlab="Cluster", legend_position="none") + 
  scale_y_log10()

p = p1 | p2 
p = p + patchwork::plot_annotation(title=paste0("Summed raw counts (", qc_feature, ", log10 scale)"))
p

m = paste0(c("nCount_", "nFeature_"), param$assay_raw)
p = ggplot(sc[[c(m, "seurat_clusters")]], aes(x=!!sym(m[1]), y=!!sym(m[2]), colour=seurat_clusters)) + 
  geom_point() + 
  AddStyle(col=param$col_clusters) +
  facet_wrap(~seurat_clusters) +
  theme(legend.position = "none")
p

Number of features

qc_feature = paste0("nFeature_", param$assay_raw)
p1 = suppressMessages(Seurat::FeaturePlot(sc, features=qc_feature) + 
  AddStyle(title="Feature plot") + 
  scale_colour_gradient(low="lightgrey", high=param$col, trans="log10"))
p1 = LabelClusters(p1, id="ident", box=FALSE)

p2 = ggplot(sc[[]], aes(x=seurat_clusters, y=!!sym(qc_feature), fill=seurat_clusters, group=seurat_clusters)) + 
  geom_violin(scale="width") + 
  AddStyle(title="Violin plot", fill=param$col_clusters,
           xlab="Cluster", legend_position="none") + 
  scale_y_log10()

p = p1 | p2 
p = p + patchwork::plot_annotation(title=paste0("Number of features with raw count > 0 (", qc_feature, ", log10 scale)"))
p

m = paste0(c("nFeature_", "nFeature_"), param$assay_raw)
p = ggplot(sc[[c(m, "seurat_clusters")]], aes(x=!!sym(m[1]), y=!!sym(m[2]), colour=seurat_clusters)) + 
  geom_point() + 
  AddStyle(col=param$col_clusters) +
  facet_wrap(~seurat_clusters) +
  theme(legend.position = "none")
p

Percent mitochondrial reads

p1 = Seurat::FeaturePlot(sc, features="percent_mt", cols=c("lightgrey", param$col)) + 
  AddStyle(title="Feature plot")
p1 = LabelClusters(p1, id="ident", box=FALSE)

p2 = ggplot(sc[[]], aes(x=seurat_clusters, y=percent_mt, fill=seurat_clusters, group=seurat_clusters)) + 
  geom_violin(scale="width") + 
  AddStyle(title="Violin plot", fill=param$col_clusters,
           xlab="Cluster", legend_position="none")
p = p1 | p2 
p = p + patchwork::plot_annotation(title="Percent of mitochondrial features (percent_mt)")
p

m = c("percent_mt", paste0("nFeature_", param$assay_raw))
p = ggplot(sc[[c(m, "seurat_clusters")]], aes(x=!!sym(m[1]), y=!!sym(m[2]), colour=seurat_clusters)) + 
  geom_point() + 
  AddStyle(col=param$col_clusters) +
  facet_wrap(~seurat_clusters) +
  theme(legend.position = "none")
p

Percent ribosomal reads

p1 = Seurat::FeaturePlot(sc, features="percent_ribo", cols=c("lightgrey", param$col)) +
  AddStyle(title="Feature plot")
p1 = LabelClusters(p1, id="ident", box=FALSE)

p2 = ggplot(sc[[]], aes(x=seurat_clusters, y=percent_ribo, fill=seurat_clusters, group=seurat_clusters)) +
  geom_violin(scale="width") +
  AddStyle(title="Violin plot", fill=param$col_clusters,
           xlab="Cluster", legend_position="none")
p = p1 | p2
p = p + patchwork::plot_annotation(title="Percent of ribosomal features (percent_ribo)")
p

m = c("percent_ribo", paste0("nFeature_", param$assay_raw))
p = ggplot(sc[[c(m, "seurat_clusters")]], aes(x=!!sym(m[1]), y=!!sym(m[2]), colour="seurat_clusters")) + 
  geom_point() + 
  AddStyle(col=param$col_clusters) +
  facet_wrap(~seurat_clusters) +
  theme(legend.position = "none")
p

Doublet score

if ("DoubletScore" %in% colnames(sc[[]])) {
  p1 = Seurat::FeaturePlot(sc, features="DoubletScore", cols=c("lightgrey", param$col)) + 
    AddStyle(title="Feature plot")
  p1 = LabelClusters(p1, id="ident", box=FALSE)
  
  p2 = ggplot(sc[[]], aes(x=seurat_clusters, y=DoubletScore, fill=seurat_clusters, group=seurat_clusters)) + 
    geom_violin(scale="width") + 
    AddStyle(title="Violin plot", fill=param$col_clusters,
             xlab="Cluster", legend_position="none")
  p = p1 | p2 
  p = p + patchwork::plot_annotation(title="Doublet score")
  p
  
  m = c("DoubletScore", paste0("nFeature_", param$assay_raw))
  p = ggplot(sc[[c(m, "seurat_clusters")]], aes(x=!!sym(m[1]), y=!!sym(m[2]), colour=seurat_clusters)) + 
    geom_point() + 
    AddStyle(col=param$col_clusters) +
    facet_wrap(~seurat_clusters) +
    theme(legend.position = "none")
  p
} else {
  message("No doublet scores found.")
}

× (Message)
No doublet scores found.

Doublet yes/no

if ("DoubletScore" %in% colnames(sc[[]])) {
  col_doublets = c("TRUE"="darkblue", "FALSE"="lightgrey")
  p1 = Seurat::DimPlot(sc, reduction="umap", group.by="IsDoublet", order="TRUE") + 
    scale_color_manual(values=col_doublets) +
    AddStyle(title="UMAP, cells coloured by doublet decision", legend_position="bottom", legend_title="IsDoublet")
  p1$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data), ]
  
  p2 = ggplot(sc[[]], aes(x=seurat_clusters, y=DoubletScore, fill=seurat_clusters, group=seurat_clusters)) + 
    geom_violin(scale="width") + 
    AddStyle(title="Violin plot", fill=param$col_clusters,
             xlab="Cluster", legend_position="none")
  p = p1 | p2 
  p = p + patchwork::plot_annotation(title="Doublet score")
  p
  
  m = c("DoubletScore", paste0("nFeature_", param$assay_raw))
  p = ggplot(sc[[c(m, "seurat_clusters")]], aes(x=!!sym(m[1]), y=!!sym(m[2]), colour=seurat_clusters)) + 
    geom_point() + 
    AddStyle(col=param$col_clusters) +
    facet_wrap(~seurat_clusters) +
    theme(legend.position = "none")
  p
} else {
  message("No doublet scores found.")
}

× (Message)
No doublet scores found.

Percent ERCC

if ("percent_ercc" %in% colnames(sc[[]])) {
  p1 = Seurat::FeaturePlot(sc, features="percent_ercc", cols=c("lightgrey", param$col)) +
    AddStyle(title="Feature plot")
  p1 = LabelClusters(p1, id="ident", box=FALSE)
  
  p2 = ggplot(sc[[]], aes(x=seurat_clusters, y=percent_ercc, fill=seurat_clusters, group=seurat_clusters)) +
    geom_violin(scale="width") +
    AddStyle(title="Violin plot", fill=param$col_clusters,
             xlab="Cluster", legend_position="none")
  p = p1 | p2
  p = p + patchwork::plot_annotation(title="Percent of ERCC features (percent_ercc)")
  print(p)
  
  # Plot percent_ercc vs nFeature by cluster
  m = c("percent_ercc", paste0("nFeature_", param$assay_raw))
  p = ggplot(sc[[c(m, "seurat_clusters")]], aes(x=!!sym(m[1]), y=!!sym(m[2]), colour=seurat_clusters)) + 
    geom_point() + 
    AddStyle(col=param$col_clusters) +
    facet_wrap(~seurat_clusters) +
    theme(legend.position = "none")
  print(p)
} else {
  message("No ERCC controls found.")
}

× (Message)
No ERCC controls found.

Known marker genes

Do cells in individual clusters express provided known marker genes?

known_markers_list=c()

# Overwrite empty list of known markers 
if (!is.null(param$file_known_markers)) {
  # Read known marker genes and map to rownames
  known_markers = openxlsx::read.xlsx(param$file_known_markers)
  known_markers_list = lapply(colnames(known_markers), function(x) {
    x_ens = known_markers[, x] %>% na.exclude() # Remove NAs that come from reading
    x_ens = gsub(x_ens, pattern=" ", replacement="", fixed=TRUE) 
    x_gene = ensembl_to_seurat_rowname[x_ens] 
    i = is.na(x_gene) # Genes that could not be translated
    j = !x_gene %in% rownames(sc) # Genes that were filtered
    if (any(i)){
      Warning(paste0("The following genes of marker list '", x, "' cannot be translated to symbols: ", x_ens[i]))
    }
    if (any(j)){
      Warning(paste0("The following genes of marker list '", x, "' cannot be found in the data: ", x_ens[j]))
    }
    x_gene = x_gene[!(i | j)] %>% unique()
    return(x_gene)
  })
  
  # Remove empty lists
  names(known_markers_list) = colnames(known_markers)
  is_empty = purrr::map_int(known_markers_list, .f=length) == 0 
  known_markers_list = known_markers_list[!is_empty]
  
  # Add lists to sc object
  sc = ScAddLists(sc, lists=setNames(known_markers_list, paste0("known_marker_", names(known_markers_list))), lists_slot="gene_lists")
}  

# Set plot options
if(length(known_markers_list) > 0) { 
  known_markers_n = length(known_markers_list) 
  known_markers_vect = unlist(known_markers_list) %>% unique() %>% sort()
  idx_dotplot = sapply(seq(known_markers_list), function(x) length(known_markers_list[[x]]) <= 50)
  idx_avgplot = sapply(seq(known_markers_list), function(x) length(known_markers_list[[x]]) >= 3)
} else { 
  known_markers_n=0
  idx_dotplot = idx_avgplot = FALSE
  known_markers_vect = c()
}

# Define number of average violin plots (no more than 6 lists per plot)
nr_avgviolin = ceiling(length(idx_avgplot) / 6)
nr_lists_per_avgviolin = ceiling(length(idx_avgplot) / nr_avgviolin)
# The height of feature and violin plots is independent of the number of clusters
# The height of dotplots is dependent on the number of clusters
height_per_row = 3
height_per_row_variable = max(2, 0.3 * length(levels(sc$seurat_clusters)))

# The total heights of average feature plots and dotplots depend on the number of lists provided
fig_height_knownMarkers_avgplot = max(height_per_row, height_per_row * sum(idx_avgplot))
# fig_height_knownMarkers_dotplot = max(height_per_row_variable, height_per_row_variable * sum(idx_dotplot)) %>% min(150)
fig_height_knownMarkers_dotplot = min(height_per_row_variable, 7)
fig_height_knownMarkers_avgvioplot = max(height_per_row_variable, height_per_row_variable * nr_avgviolin)

# The total height of individual feature plots depends on the total number of known markers
nr_rows = ceiling(length(known_markers_vect)/4)
fig_height_knownMarkers_vect = max(height_per_row, height_per_row * nr_rows)

You provided 7 list(s) of known marker genes. In the following tabs, you find:

  • Dot plots for all gene lists containing at most 50 genes
  • Average feature/violin plots for all gene lists containing at least 3 genes
  • Individual feature plots for all genes if there are no more than 200 genes in total

Dot plot(s)

A dot plot visualises how gene expression changes across different clusters. The size of a dot encodes the percentage of cells in a cluster that expresses the gene, while the color encodes the scaled average expression across all cells within the cluster. Per gene (column), we group cells based on cluster identity (rows), calculate average expression per cluster, subtract the mean of average expression values and divide by the standard deviation. The resulting scores describe how high or low a gene is expressed in a cluster compared to all other clusters.

if ((known_markers_n > 0) & any(idx_dotplot)) {
  known_markers_dotplot = known_markers_list[idx_dotplot]
  for (i in seq(known_markers_dotplot)) {
    g = known_markers_dotplot[[i]]
    g = g[length(g):1]
    p = suppressMessages(
      Seurat::DotPlot(sc, features=g) + 
        scale_colour_gradient2(low="steelblue", mid="lightgrey", high="darkgoldenrod1") +
        AddStyle(title=paste("Known marker genes:", names(known_markers_dotplot)[i]), ylab="Cluster") + 
        theme(axis.text.x=element_text(angle=90, hjust=1, vjust=.5)) +
        lims(size=c(0,100))
      )
    print(p)
  }
} else if ((known_markers_n > 0) & !any(idx_dotplot)) {
  message("This tab is used for dot plots for up to 50 genes. All provided lists are longer than this, and hence dot plots are skipped.")
} else {
  message("No known marker genes were provided and hence dot plots are skipped.")
}

Average feature plot(s)

An average feature plot visualises the average gene expression of each gene list on a single-cell level, subtracted by the aggregated expression of control feature sets. The color of the plot encodes the calculated scores, whereat positive scores suggest that genes are expressed more highly than expected.

if ((known_markers_n > 0) & any(idx_avgplot)) {
  known_markers_avgplot = known_markers_list[idx_avgplot]
  sc = Seurat::AddModuleScore(sc, features=known_markers_avgplot, ctrl=10, name="known_markers")
  idx_replace_names = grep("^known_markers[0-9]+$", colnames(sc@meta.data), perl=TRUE)
  colnames(sc@meta.data)[idx_replace_names] = names(known_markers_avgplot)
  p_list = Seurat::FeaturePlot(sc, features=names(known_markers_avgplot), cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
  for (i in seq(known_markers_avgplot)) {
    p_list[[i]] = p_list[[i]] + AddStyle(title=paste("Known marker genes:", names(known_markers_avgplot)[i]))
  }
  p = patchwork::wrap_plots(p_list, ncol=1)
  print(p)
} else if ((known_markers_n > 0) & !any(idx_avgplot)) {
  message("This tab is used to plot an average for 3 or more genes. All provided lists are shorter than this, and hence average feature plots are skipped.")
} else {
  message("No known marker genes were provided and hence average feature plots are skipped.")
}

× (Message)
This tab is used to plot an average for 3 or more genes. All provided lists are shorter than this, and hence average feature plots are skipped.

Average violin plot

if ((known_markers_n > 0) & any(idx_avgplot)) {
  p_list = list()
  for (i in 1:nr_avgviolin) {
    idx1 = (1+((i-1)*nr_lists_per_avgviolin))
    idx2 = min(idx1 + nr_lists_per_avgviolin - 1, length(known_markers_avgplot))

    p_list[[i]] = Seurat::VlnPlot(sc, features=names(known_markers_avgplot)[idx1:idx2], pt.size=0, stack=TRUE, group.by="seurat_clusters",
                                  fill.by="ident", idents=sc$seurat_clusters, cols=param$col_clusters) +
      AddStyle(legend_position="none")
  }
  p = patchwork::wrap_plots(p_list, ncol=1)
  p
} else if ((known_markers_n > 0) & !any(idx_avgplot)) {
  message("This tab is used to plot an average for 3 or more genes. All provided lists are shorter than this, and hence average feature plots are skipped.")
} else {
  message("No known marker genes were provided and hence average feature plots are skipped.")
}

× (Message)
This tab is used to plot an average for 3 or more genes. All provided lists are shorter than this, and hence average feature plots are skipped.

Individual feature plots

An individual feature plot colours single cells on the UMAP according to their normalised gene expression.

if ((known_markers_n > 0) & length(known_markers_vect) <= 200) {
  p_list = Seurat::FeaturePlot(sc, features=known_markers_vect, cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
  for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle()
  p = patchwork::wrap_plots(p_list, ncol=4)
  print(p)
} else if (length(known_markers_vect) > 200) { 
  message("This tab is used to plot up to 200 known marker genes. Your provided list is longer than this, and hence individual feature plots are skipped.")
} else {
  message("No known marker genes were provided and hence individual feature plots are skipped.")
}

Marker genes

We next identify genes that are differentially expressed in one cluster compared to all other clusters, based on raw RNA data and the method “MAST”. Resulting p-values are adjusted using the Bonferroni method. However, note that the p-values are likely inflated, since both clusters and marker genes were determined based on the same gene expression data, and there ought to be gene expression differences by design. Nevertheless, p-values can be used to sort and prioritize marker genes. We require marker genes to be expressed in at least 25% of cells in the respective cluster, with a minimum log2 fold change of 1 and adjusted p-value of at most 0.05. The names of differentially expressed genes per cluster, alongside statistical measures and additional gene annotation are written to file.

What are marker genes?

As described above, cell clusters approximate cell types and states. But how do we know which cell types these are? To characterize cell clusters, we identify marker genes. Good marker genes are genes that are particularly expressed in one cluster, and existing knowledge of these marker genes can be used to extrapolate biological functions for the cluster. A good clustering of cells typically results in good marker genes. Hence, if you cannot find good marker genes you may need to go back to the start of the workflow and adapt your parameters. Note that we also determine genes that are particularly down-regulated in one cluster, even if these are not marker genes in the classical sense.

Good marker genes are highly and possibly even only expressed in one cluster as compared to all other clusters. However, sometimes marker genes are also expressed in other clusters, or are declared as marker genes in these clusters, for example cell lineage markers that are shared by different cell subtypes. To evaluate marker genes, it is essential to visualize their expression patterns.

In addition to detecting marker genes, it might be informative to detect genes that are differentially expressed between one specific cluster and one or several other clusters. This approach allows a more specific distinction of individual clusters and investigation of more subtle differences, see the section “Differentially expressed genes” below.
# Find DEGs for every cluster compared to all remaining cells, report positive (=markers) and negative ones
# min.pct = requires feature to be detected at this minimum percentage in either of the two groups of cells 
# logfc.threshold = requires a feature to be differentially expressed on average by some amount between the two groups
# only.pos = find only positive markers 

# Review recommends using "MAST"; Mathias uses "LR"
# ALWAYS USE: assay="RNA"/"Spatial" or assay="SCT"
# DONT USE: assay=integrated datasets; this data is normalised and contains only 2k genes
# Note: By default, the function uses slot="data". Mast requires log data, so this is the correct way to do it.
#   https://www.bioconductor.org/packages/release/bioc/vignettes/MAST/inst/doc/MAST-interoperability.html
markers = suppressMessages(Seurat::FindAllMarkers(sc, assay=param$assay_raw, test.use="MAST",
                                               only.pos=FALSE, min.pct=param$marker_pct, logfc.threshold=param$marker_log2FC,
                                               latent.vars=param$latent_vars, verbose=FALSE, silent=TRUE))

# If no markers were found, initialise the degs table so that further downstream (export) chunks run
if (ncol(markers)==0) markers = DegsEmptyMarkerResultsTable(levels(sc$seurat_clusters))

# For Seurat versions until 3.2, log fold change is based on the natural log. Convert to log base 2.
if ("avg_logFC" %in% colnames(markers) & !"avg_log2FC" %in% colnames(markers)) {
  lfc_idx = grep("avg_log\\S*FC", colnames(markers))
  markers[,lfc_idx] = marker_deg_results[,lfc_idx] / log(2)
  col_nms = colnames(markers)
  col_nms[2] = "avg_log2FC"
  colnames(markers) = col_nms
}

# Sort markers
markers = markers %>% DegsSort(group=c("cluster"))
  
# Filter markers 
markers_filt = DegsFilter(markers, cut_log2FC=param$marker_log2FC, cut_padj=param$marker_padj)
markers_found = nrow(markers_filt$all)>0

# Add average data to table
markers_out = cbind(markers_filt$all, DegsAvgDataPerIdentity(sc, genes=markers_filt$all$gene, assay=param$assay_raw))

# Split by cluster and write to file
additional_readme = data.frame(Column=c("cluster",
                                        "p_val_adj_score",
                                        "avg_<assay>_<slot>_id<cluster>"), 
                               Description=c("Cluster",
                                             "Score calculated as follows: -log10(p_val_adj)*sign(avg_log2FC)",
                                             "Average expression value for cluster; <assay>: RNA or SCT; <slot>: raw counts or normalised data"))

invisible(DegsWriteToFile(split(markers_out, markers_out$cluster),
                                       annot_ensembl=annot_ensembl,
                                       gene_to_ensembl=seurat_rowname_to_ensembl,
                                       additional_readme=additional_readme,
                                       file=file.path(param$path_out, "marker_degs", "markers_cluster_vs_rest.xlsx")))


# Plot number of differentially expressed genes
p = MarkersPlotNumbers(markers=markers_filt$all, 
                       group="cluster", 
                       title=paste0("Number of marker genes, comparing each cluster to the rest\n(FC=", 2^param$marker_log2FC, ", adj. p-value=", param$marker_padj, ")")) 

# Add marker table to seurat object
Seurat::Misc(sc, "markers") = list(condition_column="seurat_clusters", test="MAST", padj=param$marker_padj, 
                                   log2FC=param$marker_log2FC, min_pct=param$marker_pct, assay=param$assay_raw, slot="data",
                                   latent_vars=param$latent_vars,
                                   results=markers_filt$all,
                                   enrichr=EmptyEnrichrDf(overlap_split=TRUE))

# Add marker lists to seurat object
marker_genesets_up = split(markers_filt$up$gene, markers_filt$up$cluster)
names(marker_genesets_up) = paste0("markers_up_cluster", names(marker_genesets_up))
marker_genesets_down = split(markers_filt$down$gene, markers_filt$down$cluster)
names(marker_genesets_down) = paste0("markers_down_cluster", names(marker_genesets_down))
sc = ScAddLists(sc, lists=c(marker_genesets_up, marker_genesets_down), lists_slot="gene_lists")

if (markers_found) {
  p
} else {
  warning("No differentially expressed genes (cluster vs rest) found. The following related code is not executed, no related plots and tables are generated.")
}

Table of top marker genes

We use the term “marker genes” to specifically describe genes that are up-regulated in cells of one cluster compared to the rest.

if (markers_found) {
  markers_top = DegsUpDisplayTop(markers_filt$up, n=6)
  
  # Add labels
  markers_top$labels = paste0(markers_top$cluster, ": ", markers_top$gene)

  # Show table
  knitr::kable(markers_top %>% dplyr::select(-labels), align="l", caption="Up to top 5 marker genes per cell cluster") %>% 
    kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>% 
    kableExtra::scroll_box(width="100%", height="700px") 
}
Up to top 5 marker genes per cell cluster
cluster gene avg_log2FC p_val p_val_adj pct.1 pct.2
1 GZMH 2.394 0.0e+00 0.0e+00 0.978 0.196
1 CD8A 2.104 0.0e+00 0.0e+00 0.832 0.220
1 NKG7 2.097 0.0e+00 0.0e+00 1.000 0.432
1 FGFBP2 2.047 0.0e+00 0.0e+00 0.895 0.178
1 CST7 1.563 0.0e+00 0.0e+00 0.986 0.336
1 GZMB 1.376 8.4e-321 1.8e-316 0.831 0.175
2 GZMK 2.387 3.5e-248 7.4e-244 0.608 0.045
2 DUSP2 1.771 1.2e-199 2.6e-195 0.871 0.365
2 IL7R 1.098 8.9e-119 1.9e-114 0.774 0.288
2 PIK3R1 1.123 9.1e-107 1.9e-102 0.802 0.447
2 LYAR 1.088 7.8e-103 1.7e-98 0.750 0.320
2 TNFAIP3 1.234 3.6e-101 7.6e-97 0.850 0.508
3 IL7R 1.860 8.7e-265 1.8e-260 0.949 0.265
3 LTB 1.962 2.7e-263 5.8e-259 0.940 0.304
3 ZFP36L2 1.434 2.2e-183 4.8e-179 0.987 0.760
3 LDHB 1.249 1.8e-168 3.8e-164 0.939 0.562
3 SARAF 1.052 3.4e-157 7.3e-153 0.997 0.830
3 LEPROTL1 1.227 9.0e-153 1.9e-148 0.927 0.484
4 LYZ 5.847 0.0e+00 0.0e+00 0.982 0.097
4 S100A9 5.789 0.0e+00 0.0e+00 0.980 0.059
4 VCAN 4.338 0.0e+00 0.0e+00 0.974 0.025
4 FCN1 3.865 0.0e+00 0.0e+00 0.971 0.050
4 CD14 3.349 0.0e+00 0.0e+00 0.900 0.008
4 CLEC7A 2.741 0.0e+00 0.0e+00 0.933 0.052
5 CD79A 3.992 0.0e+00 0.0e+00 0.995 0.016
5 IGHM.1 3.967 0.0e+00 0.0e+00 0.795 0.010
5 HLA-DQA1 3.068 0.0e+00 0.0e+00 0.975 0.027
5 MS4A1 2.800 0.0e+00 0.0e+00 0.880 0.015
5 LINC00926 2.656 0.0e+00 0.0e+00 0.873 0.017
5 BANK1 2.359 0.0e+00 0.0e+00 0.839 0.010
6 KLRF1 2.227 7.3e-267 1.6e-262 0.787 0.038
6 GNLY 3.039 2.9e-235 6.2e-231 0.972 0.288
6 CD247 1.997 9.6e-227 2.0e-222 0.959 0.413
6 PRF1 2.411 4.9e-223 1.0e-218 0.975 0.328
6 GZMB 2.309 5.5e-216 1.2e-211 0.956 0.266
6 SPON2 2.294 2.2e-181 4.6e-177 0.754 0.139
7 NRGN 7.534 0.0e+00 0.0e+00 0.927 0.083
7 PPBP 6.943 0.0e+00 0.0e+00 0.741 0.018
7 PF4 6.562 0.0e+00 0.0e+00 0.783 0.014
7 CAVIN2 6.462 0.0e+00 0.0e+00 0.783 0.013
7 HIST1H2AC 6.425 0.0e+00 0.0e+00 0.766 0.034
7 GNG11 6.096 0.0e+00 0.0e+00 0.718 0.016
8 LEF1 1.708 2.6e-160 5.5e-156 0.733 0.069
8 CCR7 1.741 6.6e-153 1.4e-148 0.753 0.083
8 RPL13 1.008 1.5e-126 3.2e-122 1.000 0.915
8 RPL32 1.032 1.3e-116 2.7e-112 1.000 0.915
8 TCF7 1.452 1.0e-112 2.2e-108 0.858 0.283
8 RPS12 1.041 1.2e-109 2.6e-105 1.000 0.912
9 LST1 3.889 2.7e-246 5.7e-242 1.000 0.191
9 FCGR3A 3.397 3.2e-210 6.7e-206 1.000 0.139
9 IFITM3 3.311 1.2e-201 2.6e-197 0.993 0.254
9 CDKN1C.1 3.259 1.5e-195 3.1e-191 0.918 0.019
9 AIF1 3.548 6.4e-190 1.4e-185 1.000 0.162
9 MS4A7 2.854 4.0e-181 8.4e-177 0.985 0.079
10 HLA-DPB1.2 3.486 2.3e-83 4.9e-79 1.000 0.525
10 HLA-DPA1.2 3.694 4.5e-81 9.5e-77 1.000 0.506
10 FCER1A 2.925 4.6e-60 9.8e-56 0.739 0.008
10 CD74 3.284 7.6e-57 1.6e-52 1.000 0.722
10 PLD4 2.363 3.2e-54 6.7e-50 0.891 0.038
10 HLA-DRB1 3.297 5.0e-53 1.1e-48 1.000 0.501

Visualisation of top marker genes

The following plots visualise the top marker genes for each cluster, respectively. Clear marker genes indicate good clusters that represent cell types.

# Note: We need to run this chunk as it specifies a variable that is used in chunk definitions below
if (markers_found) {

  # The height of feature and violin plots is independent of the number of clusters
  # The height of dotplots is dependent on the number of clusters
  height_per_row = 3
  height_per_row_variable = max(3, 0.3 * length(levels(sc$seurat_clusters)))

  # Feature plots and violin plots: each row contains 3 plots
  #   The plot has 3 columns and 2 rows per cluster, hence the layout works nicely if we find 
  #     at least 6 markers per cluster
  nr_rows_3cols = ceiling(nrow(markers_top)/3)
  fig_height_3cols = height_per_row * nr_rows_3cols
  
  # Dotplots: each row contains 2 plots
  nr_rows_dp_2cols = ceiling(length(levels(sc$seurat_clusters))/2)
  fig_height_dp_2cols = height_per_row_variable * nr_rows_dp_2cols %>% min(100)
  
} else {
  fig_height_3cols = fig_height_dp_2cols = 7 
}

Feature plots

if (markers_found) {
  # Plot each marker one by one, and then combine them all at the end
  p_list = list()
  for (i in 1:nrow(markers_top)) { 
    p_list[[i]] = Seurat::FeaturePlot(sc, features=markers_top$gene[i], 
                                      cols=c("lightgrey", param$col_clusters[markers_top$cluster[i]]),  
                                      combine=TRUE, label=TRUE) + 
      AddStyle(title=markers_top$labels[i], 
               xlab="", ylab="", 
               legend_position="bottom")
  }
  
  # Combine all plots
  p = patchwork::wrap_plots(p_list, ncol=3) + 
    patchwork::plot_annotation(title="UMAP, cells coloured by normalised gene expression data, top marker genes per cluster")
  p
}

Violin plots (normalised)

if (markers_found) {
  # Plot violin plots per marker gene, and combine it all at the end
  # This layout works out nicely if there are 4 marker genes per cluster
  p_list = list()
  for(i in 1:nrow(markers_top)) { 
    p_list[[i]] = Seurat::VlnPlot(sc, features=markers_top$gene[i], assay=param$assay_raw, pt.size=0, cols=param$col_clusters) + 
      AddStyle(title=markers_top$labels[i], xlab="") + 
      theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1)) 
  }
  p = patchwork::wrap_plots(p_list, ncol=3) + 
    patchwork::plot_annotation(title="Violin plot of for normalised gene expression data, top marker genes per cluster") & theme(legend.position="none")
  p
}

Dot plot (scaled)

if (markers_found) {
  # Visualises how feature expression changes across different clusters
  # Plot dotplots per cluster, and combine it all at the end
  p_list = lapply(markers_top$cluster %>% sort() %>% unique(), function(cl) {
    genes = markers_top %>% dplyr::filter(cluster==cl) %>% dplyr::pull(gene)
    p = suppressMessages(Seurat::DotPlot(sc, features=genes) + 
                           scale_colour_gradient2(low="steelblue", mid="lightgrey", high="darkgoldenrod1") +
                           AddStyle(title=paste0("Top marker genes for cluster ", cl, " (scaled)"), ylab="Cluster", legend_position="bottom") + 
                           theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) + 
                           guides(size=guide_legend(order=1)))
    return(p)
  })
  
  p = patchwork::wrap_plots(p_list, ncol=2) 
  p
}

Dot plot (non-scaled)

if (markers_found) {
  # Visualises how feature expression changes across different clusters
  # Plot dotplots per cluster, and combine it all at the end
  p_list = lapply(markers_top$cluster %>% sort() %>% unique(), function(cl) {
    genes = markers_top %>% dplyr::filter(cluster==cl) %>% dplyr::pull(gene)
    genes = genes[length(genes):1]
    p = suppressMessages(DotPlotUpdated(sc, features=genes, scale=FALSE, cols=c("lightgrey", param$col)) + 
                           AddStyle(title=paste0("Top marker genes for cluster ", cl, " (not scaled)"), ylab="Cluster", legend_position="bottom") + 
                           theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) + 
                           guides(size=guide_legend(order=1)))
    return(p)
  })
      
  p = patchwork::wrap_plots(p_list, ncol=2)
  p
}

× (Warning) aes_string() was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation ideoms with aes()

Heatmaps

All up- and down-regulated genes

if (markers_found) {
  # This will sample 500 cells; the number of cells per seurat_cluster will be proportional
  cells_subset = ScSampleCells(sc, n=500, group="seurat_clusters", group_proportional=TRUE, seed=1)
    
  # Heatmap of all differentially expressed genes
  p = Seurat::DoHeatmap(sc, features=markers_filt$all$gene, group.colors=param$col_clusters, label=FALSE, cells=cells_subset) + 
    NoLegend() + 
    theme(axis.text.y=element_blank()) +
    ggtitle("Heatmap of scaled gene expression data, all genes differentially expressed between a cluster and the rest")
  p
}

Top 300 up-regulated genes

if (markers_found) {
  # This will sample 500 cells; the number of cells per seurat_cluster will be proportional
  cells_subset = ScSampleCells(sc, n=500, group="seurat_clusters", group_proportional=TRUE, seed=1)
  # With fig.height = 20, 300 features can be shown; distribute among clusters
  features_per_group = 300/length(levels(markers_filt$up$cluster))
  features_subset = markers_filt$up %>% 
    dplyr::group_by(cluster) %>% 
    dplyr::top_n(n=features_per_group, wt=avg_log2FC) %>% 
    dplyr::arrange(cluster, -avg_log2FC) %>%
    dplyr::pull(gene) %>%
    unique()
  
  # Heatmap of top up-regulated genes
  p = Seurat::DoHeatmap(sc, features=features_subset, group.colors=param$col_clusters, label=FALSE, cells=cells_subset) + 
    NoLegend() + 
    theme(axis.text.y=element_text(size=8)) +
    ggtitle("Heatmap of scaled gene expression data, top genes up-regulated in a cluster compared to the rest")
  p
}

Top 300 down-regulated genes

if (markers_found) {
  # This will sample 500 cells; the number of cells per seurat_cluster will be proportional
  cells_subset = ScSampleCells(sc, n=500, group="seurat_clusters", group_proportional=TRUE, seed=1)
  # With fig.height = 20, 300 features can be shown; distribute among clusters
  features_per_group = 300/length(levels(markers_filt$down$cluster))
  features_subset = markers_filt$down %>% 
    dplyr::group_by(cluster) %>% 
    dplyr::top_n(n=features_per_group, wt=-avg_log2FC) %>% 
    dplyr::arrange(cluster, avg_log2FC) %>%
    dplyr::pull(gene) %>%
    unique()
  
  # Heatmap of top down-regulated genes
  p = Seurat::DoHeatmap(sc, features=features_subset, group.colors=param$col_clusters, label=FALSE, cells=cells_subset) + 
    NoLegend() + 
    theme(axis.text.y=element_text(size=8)) +
    ggtitle("Heatmap of scaled gene expression data, top genes up-regulated in a cluster compared to the rest")
  p
}

Functional enrichment analysis

To gain first insights into potential functions of cells in a cluster, we test for over-representation of functional terms amongst up- and down-regulated genes of each cluster. Over-represented terms are written to file.

We first translate gene symbols of up- and down-regulated genes per cluster into Entrez gene symbols, and then use the “enrichR” R-package to access the “Enrichr” website https://amp.pharm.mssm.edu/Enrichr/. You can choose to test functional enrichment from a wide range of databases:

# Set Enrichr database
enrichR::setEnrichrSite(param$enrichr_site)

× (Message)
Connection changed to https://maayanlab.cloud/Enrichr/

× (Message)
Connection is Live!

# List databases
dbs_all = enrichR::listEnrichrDbs()
knitr::kable(dbs_all, align="l", caption="Enrichr databases") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>% 
  kableExtra::scroll_box(width="100%", height="300px")
Enrichr databases
geneCoverage genesPerTerm libraryName link numTerms appyter categoryId
13362 275 Genome_Browser_PWMs http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/ 615 ea115789fcbf12797fd692cec6df0ab4dbc79c6a 1
27884 1284 TRANSFAC_and_JASPAR_PWMs http://jaspar.genereg.net/html/DOWNLOAD/ 326 7d42eb43a64a4e3b20d721fc7148f685b53b6b30 1
6002 77 Transcription_Factor_PPIs 290 849f222220618e2599d925b6b51868cf1dab3763 1
47172 1370 ChEA_2013 http://amp.pharm.mssm.edu/lib/cheadownload.jsp 353 7ebe772afb55b63b41b79dd8d06ea0fdd9fa2630 7
47107 509 Drug_Perturbations_from_GEO_2014 http://www.ncbi.nlm.nih.gov/geo/ 701 ad270a6876534b7cb063e004289dcd4d3164f342 7
21493 3713 ENCODE_TF_ChIP-seq_2014 http://genome.ucsc.edu/ENCODE/downloads.html 498 497787ebc418d308045efb63b8586f10c526af51 7
1295 18 BioCarta_2013 https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways 249 4a293326037a5229aedb1ad7b2867283573d8bcd 7
3185 73 Reactome_2013 http://www.reactome.org/download/index.html 78 b343994a1b68483b0122b08650201c9b313d5c66 7
2854 34 WikiPathways_2013 http://www.wikipathways.org/index.php/Download_Pathways 199 5c307674c8b97e098f8399c92f451c0ff21cbf68 7
15057 300 Disease_Signatures_from_GEO_up_2014 http://www.ncbi.nlm.nih.gov/geo/ 142 248c4ed8ea28352795190214713c86a39fd7afab 7
4128 48 KEGG_2013 http://www.kegg.jp/kegg/download/ 200 eb26f55d3904cb0ea471998b6a932a9bf65d8e50 7
34061 641 TF-LOF_Expression_from_GEO http://www.ncbi.nlm.nih.gov/geo/ 269 1
7504 155 TargetScan_microRNA http://www.targetscan.org/cgi-bin/targetscan/data_download.cgi?db=vert_61 222 f4029bf6a62c91ab29401348e51df23b8c44c90f 7
16399 247 PPI_Hub_Proteins http://amp.pharm.mssm.edu/X2K 385 69c0cfe07d86f230a7ef01b365abcc7f6e52f138 2
12753 57 GO_Molecular_Function_2015 http://www.geneontology.org/GO.downloads.annotations.shtml 1136 f531ac2b6acdf7587a54b79b465a5f4aab8f00f9 7
23726 127 GeneSigDB https://pubmed.ncbi.nlm.nih.gov/22110038/ 2139 6d655e0aa3408a7accb3311fbda9b108681a8486 4
32740 85 Chromosome_Location http://software.broadinstitute.org/gsea/msigdb/index.jsp 386 8dab0f96078977223646ff63eb6187e0813f1433 7
13373 258 Human_Gene_Atlas http://biogps.org/downloads/ 84 0741451470203d7c40a06274442f25f74b345c9c 5
19270 388 Mouse_Gene_Atlas http://biogps.org/downloads/ 96 31191bfadded5f96983f93b2a113cf2110ff5ddb 5
13236 82 GO_Cellular_Component_2015 http://www.geneontology.org/GO.downloads.annotations.shtml 641 e1d004d5797cbd2363ef54b1c3b361adb68795c6 7
14264 58 GO_Biological_Process_2015 http://www.geneontology.org/GO.downloads.annotations.shtml 5192 bf120b6e11242b1a64c80910d8e89f87e618e235 7
3096 31 Human_Phenotype_Ontology http://www.human-phenotype-ontology.org/ 1779 17a138b0b70aa0e143fe63c14f82afb70bc3ed0a 3
22288 4368 Epigenomics_Roadmap_HM_ChIP-seq http://www.roadmapepigenomics.org/ 383 e1bc8a398e9b21f9675fb11bef18087eda21b1bf 1
4533 37 KEA_2013 http://amp.pharm.mssm.edu/lib/keacommandline.jsp 474 462045609440fa1e628a75716b81a1baa5bd9145 7
10231 158 NURSA_Human_Endogenous_Complexome https://www.nursa.org/nursa/index.jsf 1796 7d3566b12ebc23dd23d9ca9bb97650f826377b16 2
2741 5 CORUM http://mips.helmholtz-muenchen.de/genre/proj/corum/ 1658 d047f6ead7831b00566d5da7a3b027ed9196e104 2
5655 342 SILAC_Phosphoproteomics http://amp.pharm.mssm.edu/lib/keacommandline.jsp 84 54dcd9438b33301deb219866e162b0f9da7e63a0 2
10406 715 MGI_Mammalian_Phenotype_Level_3 http://www.informatics.jax.org/ 71 c3bfc90796cfca8f60cba830642a728e23a53565 7
10493 200 MGI_Mammalian_Phenotype_Level_4 http://www.informatics.jax.org/ 476 0b09a9a1aa0af4fc7ea22d34a9ae644d45864bd6 7
11251 100 Old_CMAP_up http://www.broadinstitute.org/cmap/ 6100 9041f90cccbc18479138330228b336265e09021c 7
8695 100 Old_CMAP_down http://www.broadinstitute.org/cmap/ 6100 ebc0d905b3b3142f936d400c5f2a4ff926c81c37 7
1759 25 OMIM_Disease http://www.omim.org/downloads 90 cb2b92578a91e023d0498a334923ee84add34eca 4
2178 89 OMIM_Expanded http://www.omim.org/downloads 187 27eca242904d8e12a38cf8881395bc50d57a03e1 4
851 15 VirusMINT http://mint.bio.uniroma2.it/download.html 85 5abad1fc36216222b0420cadcd9be805a0dda63e 4
10061 106 MSigDB_Computational http://www.broadinstitute.org/gsea/msigdb/collections.jsp 858 e4cdcc7e259788fdf9b25586cce3403255637064 4
11250 166 MSigDB_Oncogenic_Signatures http://www.broadinstitute.org/gsea/msigdb/collections.jsp 189 c76f5319c33c4833c71db86a30d7e33cd63ff8cf 4
15406 300 Disease_Signatures_from_GEO_down_2014 http://www.ncbi.nlm.nih.gov/geo/ 142 aabdf7017ae55ae75a004270924bcd336653b986 7
17711 300 Virus_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 323 45268b7fc680d05dd9a29743c2f2b2840a7620bf 4
17576 300 Virus_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 323 5f531580ccd168ee4acc18b02c6bdf8200e19d08 4
15797 176 Cancer_Cell_Line_Encyclopedia https://portals.broadinstitute.org/ccle/home 967 eb38dbc3fb20adafa9d6f9f0b0e36f378e75284f 5
12232 343 NCI-60_Cancer_Cell_Lines http://biogps.org/downloads/ 93 75c81676d8d6d99d262c9660edc024b78cfb07c9 5
13572 301 Tissue_Protein_Expression_from_ProteomicsDB https://www.proteomicsdb.org/ 207 7
6454 301 Tissue_Protein_Expression_from_Human_Proteome_Map http://www.humanproteomemap.org/index.php 30 49351dc989f9e6ca97c55f8aca7778aa3bfb84b9 5
3723 47 HMDB_Metabolites http://www.hmdb.ca/downloads 3906 1905132115d22e4119bce543bdacaab074edb363 6
7588 35 Pfam_InterPro_Domains ftp://ftp.ebi.ac.uk/pub/databases/interpro/ 311 e2b4912cfb799b70d87977808c54501544e4cdc9 6
7682 78 GO_Biological_Process_2013 http://www.geneontology.org/GO.downloads.annotations.shtml 941 5216d1ade194ffa5a6c00f105e2b1899f64f45fe 7
7324 172 GO_Cellular_Component_2013 http://www.geneontology.org/GO.downloads.annotations.shtml 205 fd1332a42395e0bc1dba82868b39be7983a48cc5 7
8469 122 GO_Molecular_Function_2013 http://www.geneontology.org/GO.downloads.annotations.shtml 402 7e3e99e5aae02437f80b0697b197113ce3209ab0 7
13121 305 Allen_Brain_Atlas_up http://www.brain-map.org/ 2192 3804715a63a308570e47aa1a7877f01147ca6202 5
26382 1811 ENCODE_TF_ChIP-seq_2015 http://genome.ucsc.edu/ENCODE/downloads.html 816 56b6adb4dc8a2f540357ef992d6cd93dfa2907e5 1
29065 2123 ENCODE_Histone_Modifications_2015 http://genome.ucsc.edu/ENCODE/downloads.html 412 55b56cd8cf2ff04b26a09b9f92904008b82f3a6f 1
280 9 Phosphatase_Substrates_from_DEPOD http://www.koehn.embl.de/depod/ 59 d40701e21092b999f4720d1d2b644dd0257b6259 2
13877 304 Allen_Brain_Atlas_down http://www.brain-map.org/ 2192 ea67371adec290599ddf484ced2658cfae259304 5
15852 912 ENCODE_Histone_Modifications_2013 http://genome.ucsc.edu/ENCODE/downloads.html 109 c209ae527bc8e98e4ccd27a668d36cd2c80b35b4 7
4320 129 Achilles_fitness_increase http://www.broadinstitute.org/achilles 216 98366496a75f163164106e72439fb2bf2f77de4e 4
4271 128 Achilles_fitness_decrease http://www.broadinstitute.org/achilles 216 83a710c1ff67fd6b8af0d80fa6148c40dbd9bc64 4
10496 201 MGI_Mammalian_Phenotype_2013 http://www.informatics.jax.org/ 476 a4c6e217a81a4a58ff5a1c9fc102b70beab298e9 7
1678 21 BioCarta_2015 https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways 239 70e4eb538daa7688691acfe5d9c3c19022be832b 7
756 12 HumanCyc_2015 http://humancyc.org/ 125 711f0c02b23f5e02a01207174943cfeee9d3ea9c 7
3800 48 KEGG_2015 http://www.kegg.jp/kegg/download/ 179 e80d25c56de53c704791ddfdc6ab5eec28ae7243 7
2541 39 NCI-Nature_2015 http://pid.nci.nih.gov/ 209 47edfc012bcbb368a10b717d8dca103f7814b5a4 7
1918 39 Panther_2015 http://www.pantherdb.org/ 104 ab824aeeff0712bab61f372e43aebb870d1677a9 7
5863 51 WikiPathways_2015 http://www.wikipathways.org/index.php/Download_Pathways 404 1f7eea2f339f37856522c1f1c70ec74c7b25325f 7
6768 47 Reactome_2015 http://www.reactome.org/download/index.html 1389 36e541bee015eddb8d53827579549e30fe7a3286 7
25651 807 ESCAPE http://www.maayanlab.net/ESCAPE/ 315 a7acc741440264717ff77751a7e5fed723307835 5
19129 1594 HomoloGene http://www.ncbi.nlm.nih.gov/homologene 12 663b665b75a804ef98add689f838b68e612f0d2a 6
23939 293 Disease_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 839 0f412e0802d76efa0374504c2c9f5e0624ff7f09 8
23561 307 Disease_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 839 9ddc3902fb01fb9eaf1a2a7c2ff3acacbb48d37e 8
23877 302 Drug_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 906 068623a05ecef3e4a5e0b4f8db64bb8faa3c897f 8
15886 9 Genes_Associated_with_NIH_Grants https://grants.nih.gov/grants/oer.htm 32876 76fc5ec6735130e287e62bae6770a3c5ee068645 6
24350 299 Drug_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 906 c9c2155b5ac81ac496854fa61ba566dcae06cc80 8
3102 25 KEA_2015 http://amp.pharm.mssm.edu/Enrichr 428 18a081774e6e0aaf60b1a4be7fd20afcf9e08399 2
31132 298 Gene_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 2460 53dedc29ce3100930d68e506f941ef59de05dc6b 8
30832 302 Gene_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 2460 499882af09c62dd6da545c15cb51c1dc5e234f78 8
48230 1429 ChEA_2015 http://amp.pharm.mssm.edu/Enrichr 395 712eb7b6edab04658df153605ec6079fa89fb5c7 7
5613 36 dbGaP http://www.ncbi.nlm.nih.gov/gap 345 010f1267055b1a1cb036e560ea525911c007a666 4
9559 73 LINCS_L1000_Chem_Pert_up https://clue.io/ 33132 5e678b3debe8d8ea95187d0cd35c914017af5eb3 7
9448 63 LINCS_L1000_Chem_Pert_down https://clue.io/ 33132 fedbf5e221f45ee60ebd944f92569b5eda7f2330 7
16725 1443 GTEx_Tissue_Expression_Down http://www.gtexportal.org/ 2918 74b818bd299a9c42c1750ffe43616aa9f7929f02 5
19249 1443 GTEx_Tissue_Expression_Up http://www.gtexportal.org/ 2918 103738763d89cae894bec9f145ac28167a90e611 5
15090 282 Ligand_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 261 1eb3c0426140340527155fd0ef67029db2a72191 8
16129 292 Aging_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 286 cd95fe1b505ba6f28cd722cfba50fdea979d3b4c 8
15309 308 Aging_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 286 74c4f0a0447777005b2a5c00c9882a56dfc62d7c 8
15103 318 Ligand_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 261 31baa39da2931ddd5f7aedf2d0bbba77d2ba7b46 8
15022 290 MCF7_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 401 555f68aef0a29a67b614a0d7e20b6303df9069c6 8
15676 310 MCF7_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 401 1bc2ba607f1ff0dda44e2a15f32a2c04767da18c 8
15854 279 Microbe_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 312 9e613dba78ef7e60676b13493a9dc49ccd3c8b3f 8
15015 321 Microbe_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 312 d0c3e2a68e8c611c669098df2c87b530cec3e132 8
3788 159 LINCS_L1000_Ligand_Perturbations_down https://clue.io/ 96 957846cb05ef31fc8514120516b73cc65af7980e 7
3357 153 LINCS_L1000_Ligand_Perturbations_up https://clue.io/ 96 3bd494146c98d8189898a947f5ef5710f1b7c4b2 7
12668 300 L1000_Kinase_and_GPCR_Perturbations_down https://clue.io/ 3644 1ccc5bce553e0c2279f8e3f4ddcfbabcf566623b 7
12638 300 L1000_Kinase_and_GPCR_Perturbations_up https://clue.io/ 3644 b54a0d4ba525eac4055c7314ca9d9312adcb220c 7
8973 64 Reactome_2016 http://www.reactome.org/download/index.html 1530 1f54638e8f45075fb79489f0e0ef906594cb0678 7
7010 87 KEGG_2016 http://www.kegg.jp/kegg/download/ 293 43f56da7540195ba3c94eb6e34c522a699b36da9 7
5966 51 WikiPathways_2016 http://www.wikipathways.org/index.php/Download_Pathways 437 340be98b444cad50bb974df69018fd598e23e5e1 7
15562 887 ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X 104 5426f7747965c23ef32cff46fabf906e2cd76bfa 1
17850 300 Kinase_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 285 bb9682d78b8fc43be842455e076166fcd02cefc3 2
17660 300 Kinase_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 285 78618915009cac3a0663d6f99d359e39a31b6660 2
1348 19 BioCarta_2016 http://cgap.nci.nih.gov/Pathways/BioCarta_Pathways 237 13d9ab18921d5314a5b2b366f6142b78ab0ff6aa 2
934 13 HumanCyc_2016 http://humancyc.org/ 152 d6a502ef9b4c789ed5e73ca5a8de372796e5c72a 2
2541 39 NCI-Nature_2016 http://pid.nci.nih.gov/ 209 3c1e1f7d1a651d9aaa198e73704030716fc09431 2
2041 42 Panther_2016 http://www.pantherdb.org/pathway/ 112 ca5f6abf7f75d9baae03396e84d07300bf1fd051 2
5209 300 DrugMatrix https://ntp.niehs.nih.gov/drugmatrix/ 7876 255c3db820d612f34310f22a6985dad50e9fe1fe 4
49238 1550 ChEA_2016 http://amp.pharm.mssm.edu/Enrichr 645 af271913344aa08e6a755af1d433ef15768d749a 7
2243 19 huMAP http://proteincomplexes.org/ 995 249247d2f686d3eb4b9e4eb976c51159fac80a89 2
19586 545 Jensen_TISSUES http://tissues.jensenlab.org/ 1842 e8879ab9534794721614d78fe2883e9e564d7759 3
22440 505 RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO http://www.ncbi.nlm.nih.gov/geo/ 1302 f0752e4d7f5198f86446678966b260c530d19d78 8
8184 24 MGI_Mammalian_Phenotype_2017 http://www.informatics.jax.org/ 5231 0705e59bff98deda6e9cbe00cfcdd871c85e7d04 7
18329 161 Jensen_COMPARTMENTS http://compartments.jensenlab.org/ 2283 56ec68c32d4e83edc2ee83bea0e9f6a3829b2279 3
15755 28 Jensen_DISEASES http://diseases.jensenlab.org/ 1811 3045dff8181367c1421627bb8e4c5a32c6d67f98 3
10271 22 BioPlex_2017 http://bioplex.hms.harvard.edu/ 3915 b8620b1a9d0d271d1a2747d8cfc63589dba39991 2
10427 38 GO_Cellular_Component_2017 http://www.geneontology.org/ 636 8fed21d22dfcc3015c05b31d942fdfc851cc8e04 7
10601 25 GO_Molecular_Function_2017 http://www.geneontology.org/ 972 b4018906e0a8b4e81a1b1afc51e0a2e7655403eb 7
13822 21 GO_Biological_Process_2017 http://www.geneontology.org/ 3166 d9da4dba4a3eb84d4a28a3835c06dfbbe5811f92 7
8002 143 GO_Cellular_Component_2017b http://www.geneontology.org/ 816 ecf39c41fa5bc7deb625a2b5761a708676e9db7c 7
10089 45 GO_Molecular_Function_2017b http://www.geneontology.org/ 3271 8d8340361dd36a458f1f0a401f1a3141de1f3200 7
13247 49 GO_Biological_Process_2017b http://www.geneontology.org/ 10125 6404c38bffc2b3732de4e3fbe417b5043009fe34 7
21809 2316 ARCHS4_Tissues http://amp.pharm.mssm.edu/archs4 108 4126374338235650ab158ba2c61cd2e2383b70df 5
23601 2395 ARCHS4_Cell-lines http://amp.pharm.mssm.edu/archs4 125 5496ef9c9ae9429184d0b9485c23ba468ee522a8 5
20883 299 ARCHS4_IDG_Coexp http://amp.pharm.mssm.edu/archs4 352 ce60be284fdd5a9fc6240a355421a9e12b1ee84a 4
19612 299 ARCHS4_Kinases_Coexp http://amp.pharm.mssm.edu/archs4 498 6721c5ed97b7772e4a19fdc3f797110df0164b75 2
25983 299 ARCHS4_TFs_Coexp http://amp.pharm.mssm.edu/archs4 1724 8a468c3ae29fa68724f744cbef018f4f3b61c5ab 1
19500 137 SysMyo_Muscle_Gene_Sets http://sys-myo.rhcloud.com/ 1135 8
14893 128 miRTarBase_2017 http://mirtarbase.mbc.nctu.edu.tw/ 3240 6b7c7fe2a97b19aecbfba12d8644af6875ad99c4 1
17598 1208 TargetScan_microRNA_2017 http://www.targetscan.org/ 683 79d13fb03d2fa6403f9be45c90eeda0f6822e269 1
5902 109 Enrichr_Libraries_Most_Popular_Genes http://amp.pharm.mssm.edu/Enrichr 121 e9b7d8ee237d0a690bd79d970a23a9fa849901ed 6
12486 299 Enrichr_Submissions_TF-Gene_Coocurrence http://amp.pharm.mssm.edu/Enrichr 1722 be2ca8ef5a8c8e17d7e7bd290e7cbfe0951396c0 1
1073 100 Data_Acquisition_Method_Most_Popular_Genes http://amp.pharm.mssm.edu/Enrichr 12 17ce5192b9eba7d109b6d228772ea8ab222e01ef 6
19513 117 DSigDB http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/ 4026 287476538ab98337dbe727b3985a436feb6d192a 4
14433 36 GO_Biological_Process_2018 http://www.geneontology.org/ 5103 b5b77681c46ac58cd050e60bcd4ad5041a9ab0a9 7
8655 61 GO_Cellular_Component_2018 http://www.geneontology.org/ 446 e9ebe46188efacbe1056d82987ff1c70218fa7ae 7
11459 39 GO_Molecular_Function_2018 http://www.geneontology.org/ 1151 79ff80ae9a69dd00796e52569e41422466fa0bee 7
19741 270 TF_Perturbations_Followed_by_Expression http://www.ncbi.nlm.nih.gov/geo/ 1958 34d08a4878c19584aaf180377f2ea96faa6a6eb1 1
27360 802 Chromosome_Location_hg19 http://hgdownload.cse.ucsc.edu/downloads.html 36 fdab39c467ba6b0fb0288df1176d7dfddd7196d5 6
13072 26 NIH_Funded_PIs_2017_Human_GeneRIF https://www.ncbi.nlm.nih.gov/pubmed/ 5687 859b100fac3ca774ad84450b1fbb65a78fcc6b12 6
13464 45 NIH_Funded_PIs_2017_Human_AutoRIF https://www.ncbi.nlm.nih.gov/pubmed/ 12558 fc5bf033b932cf173633e783fc8c6228114211f8 6
13787 200 Rare_Diseases_AutoRIF_ARCHS4_Predictions https://amp.pharm.mssm.edu/geneshot/ 3725 375ff8cdd64275a916fa24707a67968a910329bb 4
13929 200 Rare_Diseases_GeneRIF_ARCHS4_Predictions https://www.ncbi.nlm.nih.gov/gene/about-generif 2244 0f7fb7f347534779ecc6c87498e96b5460a8d652 4
16964 200 NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions https://www.ncbi.nlm.nih.gov/pubmed/ 12558 f77de51aaf0979dd6f56381cf67ba399b4640d28 6
17258 200 NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions https://www.ncbi.nlm.nih.gov/pubmed/ 5684 25fa899b715cd6a9137f6656499f89cd25144029 6
10352 58 Rare_Diseases_GeneRIF_Gene_Lists https://www.ncbi.nlm.nih.gov/gene/about-generif 2244 0fb9ac92dbe52024661c088f71a1134f00567a8b 4
10471 76 Rare_Diseases_AutoRIF_Gene_Lists https://amp.pharm.mssm.edu/geneshot/ 3725 ee3adbac2da389959410260b280e7df1fd3730df 4
12419 491 SubCell_BarCode http://www.subcellbarcode.org/ 104 b50bb9480d8a77103fb75b331fd9dd927246939a 2
19378 37 GWAS_Catalog_2019 https://www.ebi.ac.uk/gwas 1737 fef3864bcb5dd9e60cee27357eff30226116c49b 7
6201 45 WikiPathways_2019_Human https://www.wikipathways.org/ 472 b0c9e9ebb9014f14561e896008087725a2db24b7 7
4558 54 WikiPathways_2019_Mouse https://www.wikipathways.org/ 176 e7750958da20f585c8b6d5bc4451a5a4305514ba 7
3264 22 TRRUST_Transcription_Factors_2019 https://www.grnpedia.org/trrust/ 571 5f8cf93e193d2bcefa5a37ccdf0eefac576861b0 1
7802 92 KEGG_2019_Human https://www.kegg.jp/ 308 3477bc578c4ea5d851dcb934fe2a41e9fd789bb4 7
8551 98 KEGG_2019_Mouse https://www.kegg.jp/ 303 187eb44b2d6fa154ebf628eba1f18537f64e797c 7
12444 23 InterPro_Domains_2019 https://www.ebi.ac.uk/interpro/ 1071 18dd5ec520fdf589a93d6a7911289c205e1ddf22 6
9000 20 Pfam_Domains_2019 https://pfam.xfam.org/ 608 a6325ed264f9ac9e6518796076c46a1d885cca7a 6
7744 363 DepMap_WG_CRISPR_Screens_Broad_CellLines_2019 https://depmap.org/ 558 0b08b32b20854ac8a738458728a9ea50c2e04800 4
6204 387 DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019 https://depmap.org/ 325 b7c4ead26d0eb64f1697c030d31682b581c8bb56 4
13420 32 MGI_Mammalian_Phenotype_Level_4_2019 http://www.informatics.jax.org/ 5261 f1bed632e89ebc054da44236c4815cdce03ef5ee 7
14148 122 UK_Biobank_GWAS_v1 https://www.ukbiobank.ac.uk/tag/gwas/ 857 958fb52e6215626673a5acf6e9289a1b84d11b4a 4
9813 49 BioPlanet_2019 https://tripod.nih.gov/bioplanet/ 1510 e110851dfc763d30946f2abedcc2cd571ac357a0 2
1397 13 ClinVar_2019 https://www.ncbi.nlm.nih.gov/clinvar/ 182 0a95303f8059bec08836ecfe02ce3da951150547 4
9116 22 PheWeb_2019 http://pheweb.sph.umich.edu/ 1161 6a7c7321b6b72c5285b722f7902d26a2611117cb 4
17464 63 DisGeNET https://www.disgenet.org 9828 3c261626478ce9e6bf2c7f0a8014c5e901d43dc0 4
394 73 HMS_LINCS_KinomeScan http://lincs.hms.harvard.edu/kinomescan/ 148 47ba06cdc92469ac79400fc57acd84ba343ba616 2
11851 586 CCLE_Proteomics_2020 https://portals.broadinstitute.org/ccle 378 7094b097ae2301a1d6a5bd856a193b084cca993d 5
8189 421 ProteomicsDB_2020 https://www.proteomicsdb.org/ 913 8c87c8346167bac2ba68195a32458aba9b1acfd1 5
18704 100 lncHUB_lncRNA_Co-Expression https://amp.pharm.mssm.edu/lnchub/ 3729 45b597d7efa5693b7e4172b09c0ed2dda3305582 1
5605 39 Virus-Host_PPI_P-HIPSTer_2020 http://phipster.org/ 6715 a592eed13e8e9496aedbab63003b965574e46a65 2
5718 31 Elsevier_Pathway_Collection http://www.transgene.ru/disease-pathways/ 1721 9196c760e3bcae9c9de1e3f87ad81f96bde24325 2
14156 40 Table_Mining_of_CRISPR_Studies 802 ad580f3864fa8ff69eaca11f6d2e7f9b86378d08 6
16979 295 COVID-19_Related_Gene_Sets https://amp.pharm.mssm.edu/covid19 205 72b0346849570f66a77a6856722601e711596cb4 7
4383 146 MSigDB_Hallmark_2020 https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp 50 6952efda94663d4bd8db09bf6eeb4e67d21ef58c 2
54974 483 Enrichr_Users_Contributed_Lists_2020 https://maayanlab.cloud/Enrichr 1482 8dc362703b38b30ac3b68b6401a9b20a58e7d3ef 6
12118 448 TG_GATES_2020 https://toxico.nibiohn.go.jp/english/ 1190 9e32560437b11b4628b00ccf3d584360f7f7daee 4
12361 124 Allen_Brain_Atlas_10x_scRNA_2021 https://portal.brain-map.org/ 766 46f8235cb585829331799a71aec3f7c082170219 5
9763 139 Descartes_Cell_Types_and_Tissue_2021 https://descartes.brotmanbaty.org/bbi/human-gene-expression-during-development/ 172 5
8078 102 KEGG_2021_Human https://www.kegg.jp/ 320 2
7173 43 WikiPathway_2021_Human https://www.wikipathways.org/ 622 2
5833 100 HuBMAP_ASCT_plus_B_augmented_w_RNAseq_Coexpression https://hubmapconsortium.github.io/ccf-asct-reporter/ 344 5
14937 33 GO_Biological_Process_2021 http://www.geneontology.org/ 6036 7
11497 80 GO_Cellular_Component_2021 http://www.geneontology.org/ 511 7
11936 34 GO_Molecular_Function_2021 http://www.geneontology.org/ 1274 7
9767 33 MGI_Mammalian_Phenotype_Level_4_2021 http://www.informatics.jax.org/ 4601 3
14167 80 CellMarker_Augmented_2021 http://biocc.hrbmu.edu.cn/CellMarker/ 1097 5
17851 102 Orphanet_Augmented_2021 http://www.orphadata.org/ 3774 4
16853 360 COVID-19_Related_Gene_Sets_2021 https://maayanlab.cloud/covid19/ 478 4
6654 136 PanglaoDB_Augmented_2021 https://panglaodb.se/ 178 5
1683 10 Azimuth_Cell_Types_2021 https://azimuth.hubmapconsortium.org/ 341 5
20414 112 PhenGenI_Association_2021 https://www.ncbi.nlm.nih.gov/gap/phegeni 950 4
26076 250 RNAseq_Automatic_GEO_Signatures_Human_Down https://maayanlab.cloud/archs4/ 4269 8
26338 250 RNAseq_Automatic_GEO_Signatures_Human_Up https://maayanlab.cloud/archs4/ 4269 8
25381 250 RNAseq_Automatic_GEO_Signatures_Mouse_Down https://maayanlab.cloud/archs4/ 4216 8
25409 250 RNAseq_Automatic_GEO_Signatures_Mouse_Up https://maayanlab.cloud/archs4/ 4216 8
11980 250 GTEx_Aging_Signatures_2021 https://gtexportal.org/ 270 4
31158 805 HDSigDB_Human_2021 https://www.hdinhd.org/ 2564 4
30006 815 HDSigDB_Mouse_2021 https://www.hdinhd.org/ 2579 4
13370 103 HuBMAP_ASCTplusB_augmented_2022 https://hubmapconsortium.github.io/ccf-asct-reporter/ 777 5
13697 343 FANTOM6_lncRNA_KD_DEGs https://fantom.gsc.riken.jp/6/ 350 1
2183 18 MAGMA_Drugs_and_Diseases https://github.com/nybell/drugsets/tree/main/DATA/GENESETS 1395 4
12765 13 PFOCR_Pathways https://pfocr.wikipathways.org/ 17326 7
1509 100 Tabula_Sapiens https://tabula-sapiens-portal.ds.czbiohub.org/ 469 5
18365 1214 ChEA_2022 https://maayanlab.cloud/chea3/ 757 1
13525 175 Diabetes_Perturbations_GEO_2022 https://appyters.maayanlab.cloud/#/Gene_Expression_T2D_Signatures 601 4
9525 245 LINCS_L1000_Chem_Pert_Consensus_Sigs https://maayanlab.cloud/sigcom-lincs/#/Download 10850 4
9440 245 LINCS_L1000_CRISPR_KO_Consensus_Sigs https://maayanlab.cloud/sigcom-lincs/#/Download 10424 4
3857 80 Tabula_Muris https://tabula-muris.ds.czbiohub.org/ 106 5
10489 61 Reactome_2022 https://reactome.org/download-data 1818 2
1198 23 SynGO_2022 https://www.syngoportal.org/ 118 3
1882 47 GlyGen_Glycosylated_Proteins_2022 https://www.glygen.org/ 338 2
1552 16 IDG_Drug_Targets_2022 https://drugcentral.org/ 888 4
6713 68 KOMP2_Mouse_Phenotypes_2022 https://www.mousephenotype.org/ 529 3
936 15 Metabolomics_Workbench_Metabolites_2022 https://www.metabolomicsworkbench.org/ 233 2
8220 146 Proteomics_Drug_Atlas_2023 https://www.nature.com/articles/s41587-022-01539-0 1748 4
9021 793 The_Kinase_Library_2023 https://kinase-library.phosphosite.org/site 303 2
8076 96 GTEx_Tissues_V8_2023 https://gtexportal.org/home/ 511 5
14698 33 GO_Biological_Process_2023 http://www.geneontology.org/ 5407 3
10972 85 GO_Cellular_Component_2023 http://www.geneontology.org/ 474 3
12126 38 GO_Molecular_Function_2023 http://www.geneontology.org/ 1147 3
13662 12 PFOCR_Pathways_2023 https://pfocr.wikipathways.org/ 21845 2
18290 34 GWAS_Catalog_2023 https://www.ebi.ac.uk/gwas 5271 4
12081 50 GeDiPNet_2023 http://gedipnet.bicnirrh.res.in/ 2388 4
12853 485 MAGNET_2023 https://magnet-winterlab.herokuapp.com/ 72 5
markers_enriched=list()
if (markers_found) {
  # Upregulated markers
  
  # Convert Seurat names of upregulated marker per cluster to Entrez; use named lists for translation
  # Is that still neccessary?
  marker_genesets_up = sapply(levels(sc$seurat_clusters), function(x) {
    tmp = markers_filt$up %>% dplyr::filter(cluster==x) %>% dplyr::pull(gene)
    tmp = sapply(tmp, function(n) seurat_rowname_to_entrez[[n]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
    return(tmp[!is.na(tmp)])
  }, USE.NAMES=TRUE, simplify=TRUE)
  
  # Tests done by Enrichr
  marker_genesets_up_enriched = purrr::map(marker_genesets_up, EnrichrTest, databases=param$enrichr_dbs, padj=param$enrichr_padj)
  marker_genesets_up_enriched = purrr::map(list_names(marker_genesets_up_enriched), function(n) {
    return(purrr::map(marker_genesets_up_enriched[[n]], function(d){
      return(cbind(d, Cluster=rep(n, nrow(d)), Direction=rep("up", nrow(d))))
    }))
  })

  # Write to files
  invisible(purrr::map(names(marker_genesets_up_enriched), function(n) {
    EnrichrWriteResults(enrichr_results=marker_genesets_up_enriched[[n]],
                        file=file.path(param$path_out, "marker_degs", paste0("functions_marker_up_cluster_", n, "_vs_rest.xlsx")))
  }))
  
  
  # Downregulated markers
  
  # Convert Seurat names of downregulated marker per cluster to Entrez; use named lists for translation
  # Is that still neccessary?
  marker_genesets_down = sapply(levels(sc$seurat_clusters), function(x) {
    tmp = markers_filt$down %>% dplyr::filter(cluster==x) %>% dplyr::pull(gene)
    tmp = sapply(tmp, function(x) seurat_rowname_to_entrez[[x]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
    return(tmp[!is.na(tmp)])
  }, USE.NAMES=TRUE, simplify=TRUE)
  
  #  Tests done by Enrichr
  marker_genesets_down_enriched = purrr::map(marker_genesets_down, EnrichrTest, databases=param$enrichr_dbs, padj=param$enrichr_padj)
  marker_genesets_down_enriched = purrr::map(list_names(marker_genesets_down_enriched), function(n) {
    return(purrr::map(marker_genesets_down_enriched[[n]], function(d){
      return(cbind(d, Cluster=rep(n, nrow(d)), Direction=rep("down", nrow(d))))
    }))
  })
  
  # Write to files
  invisible(purrr::map(names(marker_genesets_down_enriched), function(n) {
    EnrichrWriteResults(enrichr_results=marker_genesets_down_enriched[[n]],
                        file=file.path(param$path_out, "marker_degs", paste0("functions_marker_down_cluster_", n, "_vs_rest.xlsx")))
  }))
  
  # Combine, flatten into data.frame and add to sc misc slot
  marker_genesets_enriched = c(marker_genesets_up_enriched, marker_genesets_down_enriched)
  marker_genesets_enriched = unname(marker_genesets_enriched)
  marker_genesets_enriched = purrr::map(marker_genesets_enriched, FlattenEnrichr) %>% dplyr::bind_rows()
  marker_genesets_enriched$Cluster = factor(marker_genesets_enriched$Cluster, levels=levels(sc$seurat_clusters))
  marker_genesets_enriched$Direction = factor(marker_genesets_enriched$Direction, levels=c("up", "down"))
  
  misc_content = Misc(sc, "markers")
  misc_content[["enrichr"]] = marker_genesets_enriched
  suppressWarnings({Misc(sc, "markers") = misc_content})
}

The following table contains the top enriched terms per cluster.

# Top enriched terms (TODO: better plots, functions)
if (markers_found) {
  cat("#### {.tabset} \n \n")
  
  # Get top ten up and down over all databases per cluster
  marker_genesets_top_enriched = marker_genesets_enriched %>% dplyr::group_by(Cluster, Direction) %>%
    dplyr::top_n(n=10, wt=Combined.Score)
  
  # Print as tabs
  for(n in levels(marker_genesets_top_enriched$Cluster)){
    cat("##### ", n, " \n")
    
    print(knitr::kable(marker_genesets_top_enriched %>% dplyr::ungroup() %>% dplyr::filter(Cluster==n) %>% dplyr::select(Database, Term, Direction, Adjusted.P.value, Odds.Ratio, Combined.Score), 
                     align="l", caption="Top ten enriched terms per geneset", format="html") %>% 
    kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>% 
    kableExtra::scroll_box(width="100%", height="700px"))

    cat(" \n \n")
  }
  cat(" \n \n")
}

1
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
Azimuth_Cell_Types_2021 CD8 T CL0000625 up 0.0000000 7265.09091 389297.056
Azimuth_Cell_Types_2021 CD8+ Effector Memory T CL0001050 up 0.0000000 3884.61111 175529.070
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 4 CL0001050 up 0.0000000 3884.61111 175529.070
Azimuth_Cell_Types_2021 CD4+ Cytotoxic T CL0000934 up 0.0000000 1942.01389 82587.940
Azimuth_Cell_Types_2021 CD56-dim Natural Killer 1 CL0000939 up 0.0000000 2305.03846 85950.828
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 2 CL0001050 up 0.0000000 2305.03846 85950.828
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 3 CL0001050 up 0.0000000 2305.03846 85950.828
Azimuth_Cell_Types_2021 Natural Killer Cell CL0000623 up 0.0000000 2305.03846 85950.828
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 1 CL0001050 up 0.0000000 2378.33333 74549.198
CellMarker_Augmented_2021 CD8+ T cell:Peripheral Blood up 0.0000000 2663.86667 67839.353
GO_Molecular_Function_2018 MHC class II protein complex binding (GO:0023026) down 0.0000022 184.70370 3236.665
GO_Molecular_Function_2018 MHC protein complex binding (GO:0023023) down 0.0000023 147.74074 2477.893
GO_Biological_Process_2018 immunoglobulin mediated immune response (GO:0016064) down 0.0061894 175.03509 1598.057
GO_Biological_Process_2018 B cell mediated immunity (GO:0019724) down 0.0061894 175.03509 1598.057
GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) down 0.0000903 147.04423 1887.825
GO_Cellular_Component_2018 autolysosome (GO:0044754) down 0.0007369 175.03509 1598.057
Azimuth_Cell_Types_2021 Dendritic Cell CL0000451 down 0.0000001 369.51852 7270.066
Azimuth_Cell_Types_2021 Monocyte CL0000576 down 0.0000001 369.51852 7270.066
Azimuth_Cell_Types_2021 CD16+ Monocyte CL0002396 down 0.0000010 138.50000 2292.202
Azimuth_Cell_Types_2021 Nonclassical Monocyte CL0000875 down 0.0000079 231.11583 3222.357
Azimuth_Cell_Types_2021 Immune CL0000738 down 0.0000079 231.11583 3222.357
CellMarker_Augmented_2021 Monocyte:Fetal Kidney down 0.0000000 51.76124 3350.114
2
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
Azimuth_Cell_Types_2021 CD4+ Effector Memory T 1 CL0001044 up 0.0000000 3331.00000 89408.362
Azimuth_Cell_Types_2021 Mucosal Associated Invariant T CL0000940 up 0.0000000 1998.20000 50514.999
Azimuth_Cell_Types_2021 Other T Cell CL0002419 up 0.0000000 1713.00000 32730.628
Azimuth_Cell_Types_2021 CD4+ Effector Memory T CL0001044 up 0.0000000 1713.00000 32730.628
Azimuth_Cell_Types_2021 CD8+ Central Memory T 1 CL0000909 up 0.0000000 1713.00000 32730.628
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 1 CL0001050 up 0.0000189 1110.33333 13824.364
Azimuth_Cell_Types_2021 CD8+ Memory/Effector T CL0000625 up 0.0000189 832.66667 9972.499
Azimuth_Cell_Types_2021 Double-negative T 1 CL0000935 up 0.0000189 832.66667 9972.499
Azimuth_Cell_Types_2021 CD4+ Effector Memory T 3 CL0001044 up 0.0000189 832.66667 9972.499
Azimuth_Cell_Types_2021 CD4+ Effector Memory T 4 CL0001044 up 0.0000189 832.66667 9972.499
Azimuth_Cell_Types_2021 Gamma-Delta T 1 CL0000798 up 0.0000189 832.66667 9972.499
Azimuth_Cell_Types_2021 CD8+ Central Memory T 3 CL0000909 up 0.0000189 832.66667 9972.499
CellMarker_Augmented_2021 CD8+ T cell:Kidney up 0.0001628 1110.33333 13824.364
CellMarker_Augmented_2021 Natural Killer T (NKT) cell:Fetal Kidney up 0.0001628 123656.00000 1466806.376
GO_Molecular_Function_2018 MHC class II protein complex binding (GO:0023026) down 0.0000009 229.36782 4202.872
GO_Molecular_Function_2018 MHC protein complex binding (GO:0023023) down 0.0000010 183.46667 3223.734
GO_Biological_Process_2018 immunoglobulin mediated immune response (GO:0016064) down 0.0034222 214.63441 2043.045
GO_Biological_Process_2018 B cell mediated immunity (GO:0019724) down 0.0034222 214.63441 2043.045
GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) down 0.0000004 275.26897 5207.882
GO_Cellular_Component_2018 MHC protein complex (GO:0042611) down 0.0000007 196.58128 3500.416
Azimuth_Cell_Types_2021 Dendritic Cell CL0000451 down 0.0000000 458.87356 9396.024
Azimuth_Cell_Types_2021 Monocyte CL0000576 down 0.0000000 458.87356 9396.024
Azimuth_Cell_Types_2021 CD16+ Monocyte CL0002396 down 0.0000002 171.99138 2983.919
Azimuth_Cell_Types_2021 Immune CL0000738 down 0.0000028 285.14286 4144.429
CellMarker_Augmented_2021 Monocyte:Fetal Kidney down 0.0000000 57.03346 3220.386
3
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
Azimuth_Cell_Types_2021 CD4+ Naive T CL0000895 up 0.0000000 1331.8000 31866.267
Azimuth_Cell_Types_2021 CD4+ Central Memory T CL0000897 up 0.0000000 1713.0000 32730.628
Azimuth_Cell_Types_2021 CD4+ T Cell CL0000624 up 0.0000000 1713.0000 32730.628
Azimuth_Cell_Types_2021 CD4+ Central Memory T 1 CL0000897 up 0.0000182 832.6667 9972.499
Azimuth_Cell_Types_2021 CD4+ Central Memory T 3 CL0000897 up 0.0000182 832.6667 9972.499
Azimuth_Cell_Types_2021 CD4+ Effector Memory T CL0001044 up 0.0000182 832.6667 9972.499
Azimuth_Cell_Types_2021 CD4+ Effector Memory T 3 CL0001044 up 0.0000182 832.6667 9972.499
Azimuth_Cell_Types_2021 CD8+ Central Memory T CL0000909 up 0.0000182 832.6667 9972.499
Azimuth_Cell_Types_2021 CD8+ Central Memory T 2 CL0000909 up 0.0000182 832.6667 9972.499
CellMarker_Augmented_2021 Natural Killer T (NKT) cell:Fetal Kidney up 0.0005043 123656.0000 1466806.376
Azimuth_Cell_Types_2021 Natural Killer Cell CL0000623 down 0.0000000 1697.2766 74510.813
Azimuth_Cell_Types_2021 CD56-dim Natural Killer CL0000939 down 0.0000000 969.4028 35760.597
Azimuth_Cell_Types_2021 CD4+ Cytotoxic T CL0000934 down 0.0000000 484.6285 16591.385
Azimuth_Cell_Types_2021 CD56-dim Natural Killer 1 CL0000939 down 0.0000000 610.4388 18508.636
Azimuth_Cell_Types_2021 CD56-dim Natural Killer 2 CL0000939 down 0.0000000 610.4388 18508.636
Azimuth_Cell_Types_2021 CD8+ Effector Memory T CL0001050 down 0.0000000 610.4388 18508.636
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 4 CL0001050 down 0.0000000 610.4388 18508.636
Azimuth_Cell_Types_2021 CD8 T CL0000625 down 0.0000000 398.8000 9630.437
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 2 CL0001050 down 0.0000000 398.8000 9630.437
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 3 CL0001050 down 0.0000000 398.8000 9630.437
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 5 CL0001050 down 0.0000000 398.8000 9630.437
Azimuth_Cell_Types_2021 Dendritic Cell CL0000451 down 0.0000000 398.8000 9630.437
4
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
Azimuth_Cell_Types_2021 CD14+ Monocyte CL0002057 up 0 568.67078 32160.905
Azimuth_Cell_Types_2021 Monocyte CL0000576 up 0 197430.00000 8631653.654
Azimuth_Cell_Types_2021 Classical Monocyte CL0000860 up 0 317.12450 9881.594
CellMarker_Augmented_2021 Monocyte:Fetal Kidney up 0 102.33851 53308.352
CellMarker_Augmented_2021 Macrophage:Synovium up 0 90.96041 13823.530
CellMarker_Augmented_2021 Monocyte:Bone Marrow up 0 83.14735 11893.633
CellMarker_Augmented_2021 Monocyte:Lung up 0 79.48868 11020.671
CellMarker_Augmented_2021 Monocyte:Plasma up 0 77.39387 10438.754
CellMarker_Augmented_2021 Monocyte:Synovium up 0 77.39387 10438.754
CellMarker_Augmented_2021 Monocyte:Liver up 0 75.98285 10203.802
GO_Biological_Process_2018 SRP-dependent cotranslational protein targeting to membrane (GO:0006614) down 0 160.51194 22361.534
GO_Biological_Process_2018 cotranslational protein targeting to membrane (GO:0006613) down 0 148.80830 20414.177
GO_Biological_Process_2018 protein targeting to ER (GO:0045047) down 0 138.69160 18747.268
GO_Biological_Process_2018 viral gene expression (GO:0019080) down 0 113.57548 14686.013
GO_Biological_Process_2018 nuclear-transcribed mRNA catabolic process, nonsense-mediated decay (GO:0000184) down 0 110.49471 14196.539
Azimuth_Cell_Types_2021 CD8 T CL0000625 down 0 1477.85950 64004.193
Azimuth_Cell_Types_2021 CD4+ Cytotoxic T CL0000934 down 0 369.40909 14427.359
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 4 CL0001050 down 0 651.40984 23913.733
Azimuth_Cell_Types_2021 CD8+ Memory/Effector T CL0000625 down 0 651.40984 23913.733
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 1 CL0001050 down 0 1130.75610 37701.000
5
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) up 0 815.8091 37955.34
GO_Cellular_Component_2018 MHC protein complex (GO:0042611) up 0 453.1364 19640.15
Azimuth_Cell_Types_2021 B Cell CL0000785 up 0 4079.8636 211403.88
Azimuth_Cell_Types_2021 B CL0000236 up 0 1772.8889 78392.86
Azimuth_Cell_Types_2021 Dendritic Cell CL0000451 up 0 1772.8889 78392.86
Azimuth_Cell_Types_2021 Memory B Cell, Kappa Light Chain CL0000787 up 0 519.3490 13000.22
Azimuth_Cell_Types_2021 Intermediate B Cell, Lambda Light Chain CL0000785 up 0 415.4583 10112.50
Azimuth_Cell_Types_2021 Intermediate B Cell CL0000785 up 0 415.4583 10112.50
Azimuth_Cell_Types_2021 Memory B Cell CL0000787 up 0 415.4583 10112.50
Azimuth_Cell_Types_2021 Myeloid Dendritic Type 2 CL0000782 up 0 415.4583 10112.50
Azimuth_Cell_Types_2021 Naive B Cell CL0000788 up 0 415.4583 10112.50
CellMarker_Augmented_2021 B cell:Kidney up 0 117.8452 14062.70
CellMarker_Augmented_2021 B cell:Lymphoid Tissue up 0 1273.0851 41456.88
Azimuth_Cell_Types_2021 CD8+ Naive T CL0000900 down 0 298.2733 18742.11
Azimuth_Cell_Types_2021 CD4+ Cytotoxic T CL0000934 down 0 2091.8947 122973.41
Azimuth_Cell_Types_2021 CD56-dim Natural Killer 1 CL0000939 down 0 1528.6923 66649.48
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 4 CL0001050 down 0 1528.6923 66649.48
Azimuth_Cell_Types_2021 CD4+ Effector Memory T 1 CL0001044 down 0 673.6271 24902.33
Azimuth_Cell_Types_2021 CD8 T CL0000625 down 0 673.6271 24902.33
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 3 CL0001050 down 0 673.6271 24902.33
Azimuth_Cell_Types_2021 Natural Killer Cell CL0000623 down 0 673.6271 24902.33
Azimuth_Cell_Types_2021 CD8+ Memory/Effector T CL0000625 down 0 673.6271 24902.33
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 1 CL0001050 down 0 1169.0000 39237.85
6
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
Azimuth_Cell_Types_2021 Natural Killer CL0000623 up 0.0000000 3118.1250 182826.896
Azimuth_Cell_Types_2021 Natural Killer Cell CL0000623 up 0.0000000 5442.8182 294524.343
Azimuth_Cell_Types_2021 CD56-dim Natural Killer CL0000939 up 0.0000000 2347.7647 108543.804
Azimuth_Cell_Types_2021 CD56-dim Natural Killer 2 CL0000939 up 0.0000000 2347.7647 108543.804
Azimuth_Cell_Types_2021 CD56-dim Natural Killer 1 CL0000939 up 0.0000000 1330.3333 51757.265
Azimuth_Cell_Types_2021 CD56-dim Natural Killer 3 CL0000939 up 0.0000000 831.4167 26628.238
Azimuth_Cell_Types_2021 CD4+ Cytotoxic T CL0000934 up 0.0000000 475.0238 14218.164
CellMarker_Augmented_2021 CD4+ Cytotoxic T cell:Liver up 0.0000000 292.5000 26128.123
CellMarker_Augmented_2021 Natural Killer cell:Peripheral Blood up 0.0000000 494.5537 21957.191
CellMarker_Augmented_2021 Natural Killer cell:Kidney up 0.0000000 831.4167 26628.238
GO_Molecular_Function_2018 MHC class II protein complex binding (GO:0023026) down 0.0000299 242.4899 3458.013
GO_Molecular_Function_2018 MHC protein complex binding (GO:0023023) down 0.0000299 196.9934 2701.616
GO_Biological_Process_2018 polyamine metabolic process (GO:0006595) down 0.0000196 350.3333 5322.234
GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) down 0.0000001 443.7333 9157.133
GO_Cellular_Component_2018 MHC protein complex (GO:0042611) down 0.0000001 316.8889 6186.309
GO_Cellular_Component_2018 integral component of lumenal side of endoplasmic reticulum membrane (GO:0071556) down 0.0000004 177.3600 3100.376
GO_Cellular_Component_2018 autolysosome (GO:0044754) down 0.0001611 332.8667 3442.909
Azimuth_Cell_Types_2021 Dendritic Cell CL0000451 down 0.0000000 739.7037 16417.993
Azimuth_Cell_Types_2021 CD8+ Central Memory T 2 CL0000909 down 0.0000000 739.7037 16417.993
Azimuth_Cell_Types_2021 CD8+ Central Memory T CL0000909 down 0.0000022 450.4737 7115.967
7
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
GO_Biological_Process_2018 platelet degranulation (GO:0002576) up 0.0000000 23.02298 947.2029
GO_Biological_Process_2018 platelet aggregation (GO:0070527) up 0.0000000 45.00455 1153.0042
GO_Biological_Process_2018 homotypic cell-cell adhesion (GO:0034109) up 0.0000000 37.23574 901.6828
GO_Biological_Process_2018 polyamine metabolic process (GO:0006595) up 0.0004465 58.28824 749.7159
Azimuth_Cell_Types_2021 Platelet CL0000233 up 0.0000000 1337.88344 66937.5772
Azimuth_Cell_Types_2021 Platelet/Megakaryocyte CL0000763 up 0.0000000 166.16407 4435.6916
CellMarker_Augmented_2021 Endocrine cell:Pancreatic Islet up 0.0000000 27.44625 1148.7420
CellMarker_Augmented_2021 Cancer Stem cell:Mammary Gland up 0.0000000 24.08785 859.1621
Descartes_Cell_Types_and_Tissue_2021 Megakaryocytes in Liver up 0.0000000 20.97509 871.2766
Descartes_Cell_Types_and_Tissue_2021 Megakaryocytes in Adrenal up 0.0000000 21.72152 743.5401
GO_Biological_Process_2018 protein targeting to ER (GO:0045047) down 0.0000000 182.24257 44891.2671
GO_Biological_Process_2018 SRP-dependent cotranslational protein targeting to membrane (GO:0006614) down 0.0000000 247.07986 60495.2091
GO_Biological_Process_2018 cotranslational protein targeting to membrane (GO:0006613) down 0.0000000 185.27170 44013.1177
GO_Biological_Process_2018 nuclear-transcribed mRNA catabolic process, nonsense-mediated decay (GO:0000184) down 0.0000000 106.20561 24870.6104
GO_Biological_Process_2018 viral gene expression (GO:0019080) down 0.0000000 85.80431 17958.5209
GO_Biological_Process_2018 viral transcription (GO:0019083) down 0.0000000 78.83500 16237.3373
GO_Biological_Process_2018 cytoplasmic translation (GO:0002181) down 0.0000000 113.33989 13500.3213
GO_Cellular_Component_2018 cytosolic ribosome (GO:0022626) down 0.0000000 72.99795 15766.6845
GO_Cellular_Component_2018 cytosolic large ribosomal subunit (GO:0022625) down 0.0000000 83.18628 10999.5987
CellMarker_Augmented_2021 Natural Killer T (NKT) cell:Fetal Kidney down 0.0000000 37.43074 24018.7795
8
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
Azimuth_Cell_Types_2021 CD4+ Naive T CL0000895 up 0.0e+00 1848.9815 109314.734
Azimuth_Cell_Types_2021 CD8+ Naive T CL0000900 up 0.0e+00 308.9628 7386.442
Azimuth_Cell_Types_2021 CD4 T CL0000624 up 0.0e+00 739.7037 16417.993
Azimuth_Cell_Types_2021 CD4+ Central Memory T CL0000897 up 0.0e+00 739.7037 16417.993
Azimuth_Cell_Types_2021 CD4+ T Cell CL0000624 up 0.0e+00 739.7037 16417.993
CellMarker_Augmented_2021 Naive CD8+ T cell:Peripheral Blood up 0.0e+00 298.4700 15516.005
CellMarker_Augmented_2021 Naive CD4+ T cell:Peripheral Blood up 0.0e+00 600.6522 27675.408
CellMarker_Augmented_2021 Naive T cell:Undefined up 5.0e-07 1577.0526 28825.392
CellMarker_Augmented_2021 Central Memory T cell:Undefined up 1.2e-06 788.4474 13424.623
CellMarker_Augmented_2021 Effector Memory T cell:Undefined up 1.2e-06 788.4474 13424.623
Azimuth_Cell_Types_2021 CD4+ Cytotoxic T CL0000934 down 0.0e+00 533.1161 22486.501
Azimuth_Cell_Types_2021 CD56-dim Natural Killer 1 CL0000939 down 0.0e+00 936.7059 36978.215
Azimuth_Cell_Types_2021 Natural Killer Cell CL0000623 down 0.0e+00 540.0310 17848.439
Azimuth_Cell_Types_2021 CD56-dim Natural Killer CL0000939 down 0.0e+00 343.1552 9284.864
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 3 CL0001050 down 0.0e+00 343.1552 9284.864
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 4 CL0001050 down 0.0e+00 343.1552 9284.864
Azimuth_Cell_Types_2021 Natural Killer T CL0000814 down 0.0e+00 274.5103 7212.098
Azimuth_Cell_Types_2021 CD56-dim Natural Killer 2 CL0000939 down 0.0e+00 226.1591 4851.656
Azimuth_Cell_Types_2021 CD8+ Effector Memory T CL0001050 down 0.0e+00 226.1591 4851.656
Azimuth_Cell_Types_2021 Dendritic Cell CL0000451 down 0.0e+00 226.1591 4851.656
9
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
Azimuth_Cell_Types_2021 CD16+ Monocyte CL0002396 up 0 641.30476 46618.535
Azimuth_Cell_Types_2021 Nonclassical Monocyte CL0000875 up 0 974.11475 38681.024
Azimuth_Cell_Types_2021 Monocyte CL0000576 up 0 249.79279 6961.423
CellMarker_Augmented_2021 CD1C-CD141- Dendritic cell:Blood up 0 87.60321 24634.814
CellMarker_Augmented_2021 Monocyte:Fetal Kidney up 0 41.88388 10687.111
CellMarker_Augmented_2021 Macrophage:Synovium up 0 71.19231 7715.251
CellMarker_Augmented_2021 Monocyte:Lung up 0 58.05882 5323.682
CellMarker_Augmented_2021 Myeloid Dendritic cell:Thymus up 0 57.21449 5224.907
CellMarker_Augmented_2021 Dendritic cell:Spleen up 0 57.21449 5224.907
CellMarker_Augmented_2021 Monocyte:Kidney up 0 55.89514 4919.665
CellMarker_Augmented_2021 Macrophage:Retinal Pigment Epithelium up 0 55.89514 4919.665
CellMarker_Augmented_2021 Macrophage:Eye up 0 55.89514 4919.665
CellMarker_Augmented_2021 Macrophage:Lymph Node up 0 55.89514 4919.665
CellMarker_Augmented_2021 Macrophage:Visceral Adipose Tissue up 0 55.89514 4919.665
CellMarker_Augmented_2021 Macrophage:Artery up 0 55.89514 4919.665
Azimuth_Cell_Types_2021 CD8 T CL0000625 down 0 2132.78571 99042.053
Azimuth_Cell_Types_2021 CD4+ Cytotoxic T CL0000934 down 0 533.11607 22486.501
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 4 CL0001050 down 0 936.70588 36978.215
Azimuth_Cell_Types_2021 CD8+ Memory/Effector T CL0000625 down 0 936.70588 36978.215
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 1 CL0001050 down 0 1620.25581 57926.354
Azimuth_Cell_Types_2021 CD56-dim Natural Killer 1 CL0000939 down 0 540.03101 17848.439
Azimuth_Cell_Types_2021 CD8+ Effector Memory T CL0001050 down 0 540.03101 17848.439
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 3 CL0001050 down 0 540.03101 17848.439
Azimuth_Cell_Types_2021 Natural Killer Cell CL0000623 down 0 540.03101 17848.439
Azimuth_Cell_Types_2021 CD8+ T Cell CL0000625 down 0 540.03101 17848.439
CellMarker_Augmented_2021 Natural Killer T (NKT) cell:Kidney down 0 540.03101 17848.439
10
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) up 0 451.7273 20172.004
GO_Cellular_Component_2018 MHC protein complex (GO:0042611) up 0 225.8182 9235.400
Azimuth_Cell_Types_2021 Plasmacytoid Dendritic Cell CL0000784 up 0 736.1852 41118.373
Azimuth_Cell_Types_2021 Conventional Dendritic Cell 2 2 CL0002399 up 0 198800.0000 10247002.976
Azimuth_Cell_Types_2021 Dendritic Cell CL0000451 up 0 198800.0000 10247002.976
Azimuth_Cell_Types_2021 Plasmacytoid Dendritic CL0000784 up 0 1807.1818 88825.310
Azimuth_Cell_Types_2021 Conventional Dendritic Cell 2 CL0002399 up 0 709.9286 26529.254
Azimuth_Cell_Types_2021 Myeloid Dendritic Type 2 CL0000782 up 0 410.4395 12812.697
Azimuth_Cell_Types_2021 Dendritic CL0000451 up 0 261.5263 6667.607
Azimuth_Cell_Types_2021 EREG+ Dendritic CL0000451 up 0 261.5263 6667.607
Azimuth_Cell_Types_2021 CD4+ Cytotoxic T CL0000934 down 0 1088.8525 55992.662
Azimuth_Cell_Types_2021 CD8 T CL0000625 down 0 1265.2063 52801.959
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 3 CL0001050 down 0 1265.2063 52801.959
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 4 CL0001050 down 0 1265.2063 52801.959
Azimuth_Cell_Types_2021 CD8+ Effector Memory T CL0001050 down 0 726.4688 25434.649
Azimuth_Cell_Types_2021 CD8+ Memory/Effector T CL0000625 down 0 726.4688 25434.649
Azimuth_Cell_Types_2021 CD8+ Effector Memory T 1 CL0001050 down 0 919.7077 28267.092
CellMarker_Augmented_2021 Natural Killer T (NKT) cell:Kidney down 0 726.4688 25434.649
CellMarker_Augmented_2021 T cell:Blood down 0 1189.7313 25034.706
CellMarker_Augmented_2021 T cell:Lymphoid Tissue down 0 1189.7313 25034.706
CellMarker_Augmented_2021 T cell:Thyroid down 0 1189.7313 25034.706

Differentially expressed genes

If requested, we identify genes that are differentially expressed between two groups of cells. Groups can be defined by columns in the cell metadata. Different types of tests can be used and input data for testing can be the different assays as well as the computed dimensionality reductions. Resulting p-values are adjusted using the Bonferroni method. The names of differentially expressed genes per cluster, alongside statistical measures and additional gene annotation are written to file.

# We first compute the DEGs for all requested contrasts

# Prepare a list with contrasts (input can be R data.frame table or Excel file)
degs_contrasts_list = DegsSetupContrastsList(sc, param$deg_contrasts, param$latent_vars)

# Add the actual data to the list
degs_contrasts_list = purrr::map(degs_contrasts_list, function(contrast){
  # If there were already errors, just return
  if (length(contrast[["error_messages"]]) > 0) return(c(contrast, list(object=NULL, cells_group1_idx_subset=as.integer(), cells_group2_idx_subset=as.integer())))
  
  # Get cells indices
  cells_group1_idx = contrast[["cells_group1_idx"]]
  cells_group2_idx = contrast[["cells_group2_idx"]]

  # Create object
  if (contrast[["use_reduction"]]) {
    # Use dimensionality reduction
    contrast[["object"]] = Seurat::Reductions(sc, slot=contrast[["assay"]])
  } else {
    # Use assay
    contrast[["object"]] = Seurat::GetAssay(sc[,unique(c(cells_group1_idx, cells_group2_idx))], assay=contrast[["assay"]])
    
    # This saves a lot of memory for parallelisation
    if (contrast[["slot"]]!="scale.data") contrast[["object"]]@scale.data = new(Class="matrix")
  }
  
  # Variable latent vars must be passed as data.frame
  if (!is.null(contrast[["latent_vars"]]) && length(contrast[["latent_vars"]]) > 0) {
    contrast[["latent_vars"]] = sc[[unique(c(cells_group1_idx, cells_group2_idx)), contrast[["latent_vars"]], drop=FALSE]]
  }

  # Now update cell indices so that they match to subset
  contrast[["cells_group1_idx_subset"]] = match(colnames(sc)[cells_group1_idx], colnames(contrast[["object"]]))
  contrast[["cells_group2_idx_subset"]] = match(colnames(sc)[cells_group2_idx], colnames(contrast[["object"]]))
  
  return(contrast)
})

# Compute the tests
# TODO: this chunk may be done in parallel in future
degs_contrasts_results = purrr::map(degs_contrasts_list, function(contrast) {
  if (length(contrast$error_messages)==0) {
    # No errors, do contrast
    test_results = suppressWarnings(
      DegsTestCellSets(object=contrast[["object"]],
                       slot=contrast[["slot"]],
                       cells_1=colnames(contrast[["object"]])[contrast[["cells_group1_idx_subset"]]],
                       cells_2=colnames(contrast[["object"]])[contrast[["cells_group2_idx_subset"]]],
                       is_reduction=contrast[["use_reduction"]],
                       logfc.threshold=contrast[["log2FC"]],
                       test.use=contrast[["test"]],
                       min.pct=contrast[["min_pct"]],
                       latent.vars=contrast[["latent_vars"]])
    )
  } else {
    # Errors, return empty data.frame
    test_results = DegsEmptyResultsTable()
  }
  
  # Sort and filter table
  test_results = test_results %>% DegsSort() %>% DegsFilter(contrast[["log2FC"]], contrast[["padj"]], split_by_dir=FALSE)

  # Add mean gene expression data (counts or data, dep on slot)
  avg.1 = DegsAvgData(contrast[["object"]], cells=contrast[["cells_group1_idx_subset"]], genes=test_results$gene, slot=contrast[["slot"]])[,1]
  avg.2 = DegsAvgData(contrast[["object"]], cells=contrast[["cells_group2_idx_subset"]], genes=test_results$gene, slot=contrast[["slot"]])[,1]
  test_results = cbind(test_results, avg.1, avg.2)
  
  # Add test results and drop unneccessary data
  contrast = c(contrast, list(results=test_results))
  contrast[["object"]] = NULL
  contrast[["cells_group1_idx_subset"]] = NULL
  contrast[["cells_group2_idx_subset"]] = NULL
  
  return(contrast)
})

# Also remove objects from deg_contrasts_list (save memory)
degs_contrasts_list = purrr::map(degs_contrasts_list, function(contrast){ contrast[["object"]] = NULL; return(contrast)})
# Use the existing variable and add Enrichr results
# Not in parallel due to server load
degs_contrasts_results = purrr::map(degs_contrasts_results, function(contrast) {
  # Get results table
  results_table = contrast$results
  
  # Drop existing results
  if ("enrichr" %in% names(contrast)) contrast[["enrichr"]] = NULL
  
  # Split into up- and downregulated DEGs, then translate to Entrez gene, runEnrichr
  degs_up = dplyr::filter(results_table, avg_log2FC > 0) %>% dplyr::pull(gene) %>% unique()
  degs_up = sapply(degs_up, function(n) seurat_rowname_to_entrez[[n]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
  degs_up = degs_up[!is.na(degs_up)]
  enrichr_results_up = EnrichrTest(genes=degs_up, databases=param$enrichr_dbs, padj=param$enrichr_padj)
  
  degs_down = dplyr::filter(results_table, avg_log2FC < 0) %>% dplyr::pull(gene) %>% unique()
  degs_down = sapply(degs_down, function(n) seurat_rowname_to_entrez[[n]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
  degs_down = degs_down[!is.na(degs_down)]
  enrichr_results_down = EnrichrTest(genes=degs_down, databases=param$enrichr_dbs, padj=param$enrichr_padj)
  
  # Flatten both enrichr results into tables
  enrichr_results_up = purrr::map_dfr(names(enrichr_results_up), function(n) {
    return(cbind(enrichr_results_up[[n]], 
          list(Database=factor(rep(n, nrow(enrichr_results_up[[n]])), levels=names(enrichr_results_up)), 
               Direction=factor(rep("up", nrow(enrichr_results_up[[n]])), levels=c("up", "down"))
               )
          ))
  })
  
  enrichr_results_down = purrr::map_dfr(names(enrichr_results_down), function(n) {
    return(cbind(enrichr_results_down[[n]], 
          list(Database=factor(rep(n, nrow(enrichr_results_down[[n]])), levels=names(enrichr_results_down)), 
               Direction=factor(rep("up", nrow(enrichr_results_down[[n]])), levels=c("up", "down"))
               )
          ))
  })
  
  # Rbind and add factor levels
  enrichr_results = rbind(enrichr_results_up, enrichr_results_down)
  return(c(contrast, list(enrichr=enrichr_results)))
})
# Now regroup list so that subsets are together again
original_contrast_rows = purrr::map_int(degs_contrasts_results, function(contrast){ return(contrast[["contrast_row"]]) })
degs = split(degs_contrasts_results, original_contrast_rows)

# Write degs to files
invisible(purrr::map_chr(degs, function(degs_subsets) {
  # The output file
  file = file.path(param$path_out, "marker_degs", paste0("degs_contrast_row_", degs_subsets[[1]][["contrast_row"]], "_results.xlsx"))
  
  # Write degs
  degs_subsets_results = purrr::map(degs_subsets, function(contrast) {return(contrast[["results"]])})
  names(degs_subsets_results) = purrr::map_chr(degs_subsets, function(contrast) {return(ifelse(!is.na(contrast[["subset_group"]]), contrast[["subset_group"]], "All"))})
  file = DegsWriteToFile(degs_subsets_results, 
                         annot_ensembl=annot_ensembl,
                         gene_to_ensembl=seurat_rowname_to_ensembl,
                         file=file,
                         additional_readme=NULL)
  
  return(file)
}))


invisible(purrr::map_chr(degs, function(degs_subsets) {
  # The output file
  file = file.path(param$path_out, "marker_degs", paste0("degs_contrast_row_", degs_subsets[[1]][["contrast_row"]],  "_functions.xlsx"))
  
  # Write Enrichr results
  degs_subsets_enrichr = purrr::map(degs_subsets, function(contrast) {return(contrast[["enrichr"]])})
  names(degs_subsets_enrichr) = purrr::map_chr(degs_subsets, function(contrast) {return(ifelse(!is.na(contrast[["subset_group"]]), contrast[["subset_group"]], "All"))})
  file = EnrichrWriteResults(degs_subsets_enrichr, file=file)
  
  return(file)
}))

# Add to sc object
Misc(sc, "degs") = degs
for (i in seq(degs)) { 
  for (j in seq(degs[[i]])) {
    # If there are errors, skip
    if (length(degs[[i]][[j]]$error_messages) > 0) {
      next
    }

    # Average expression of all genes
    x = subset(sc, cells=degs[[i]][[j]]$cells_group2_idx) %>% GetAssayData(slot="data")
    x = log2(Matrix::rowMeans(expm1(x)) + 1)
    y = subset(sc, cells=degs[[i]][[j]]$cells_group1_idx) %>% GetAssayData(slot="data") 
    y = log2(Matrix::rowMeans(expm1(y)) + 1)
    sc_avg_log2FC = data.frame(x, y, col="none", gene=rownames(sc))
    lims = c(min(c(x, y)), max(x, y))
    
    ## Color DEGs
    up = degs[[i]][[j]]$results %>% dplyr::filter(avg_log2FC > 0) %>% dplyr::pull(gene)
    if (length(up) > 0) sc_avg_log2FC[up, "col"] = "up"
    down = degs[[i]][[j]]$results %>% dplyr::filter(avg_log2FC < 0) %>% dplyr::pull(gene)
    if (length(down) > 0) sc_avg_log2FC[down, "col"] = "down"

    ## Plots
    degs[[i]][[j]]$plot_scatter = ggplot(sc_avg_log2FC %>% dplyr::arrange(col, gene), aes(x=x, y=y, col=col)) + 
      geom_abline(slope=1, intercept=0, col="lightgrey") + 
      geom_abline(slope=1, intercept=c(-degs[[i]][[j]]$log2FC, degs[[i]][[j]]$log2FC), col="lightgrey", lty=2) + 
      geom_point() + 
      xlim(lims) + ylim(lims) +
      AddStyle(ylab=degs[[i]][[j]]$condition_group1, xlab=degs[[i]][[j]]$condition_group2, 
               col=c(none="grey", up="darkgoldenrod1", down="steelblue"), 
               legend_position="bottom", legend_title="Filtered genes")

    # Feature plot of top 4 genes, sorted by the p-value
    degs_top = degs[[i]][[j]]$results %>% dplyr::top_n(n=-4, wt=p_val) %>% dplyr::top_n(n=-4, wt=avg_log2FC) %>% dplyr::pull(gene)
    if (length(degs_top) > 0) {
      p_list = Seurat::FeaturePlot(sc, features=degs_top, cols=c("lightgrey", param$col), combine=FALSE, label=FALSE)
      for (p in seq(p_list)) p_list[[p]] = p_list[[p]] + AddStyle(legend_position="bottom", xlab="", ylab="")
      degs[[i]][[j]]$plot_feature = patchwork::wrap_plots(p_list, ncol=ifelse(length(degs_top) > 1, 2, 1))
    } else {
      degs[[i]][[j]]$plot_feature = NULL
    }
  }
}
knitr_header_string = "

## {{condition_column}}: {{condition_group1}} vs {{condition_group2}}

General configuration:

* assay: {{assay}}
* slot: {{slot}}
* test: {{test}}
* maximum adjusted p-value: {{padj}}
* minimum absolute log2 foldchange: {{log2FC}}
* minimum percentage of cells: {{min_pct}}
* latent vars: {{latent_vars}}
* output files: degs_contrast_row_{{row}}_results.xlsx, degs_contrast_row_{{row}}_functions.xlsx

Subset on column: \'{{subset_column}}\'"

if (length(degs)==0) message("No DEG contrasts specified.")

for (i in seq(degs)) {
  # Get subsets
  degs_subsets = degs[[i]]
  first_contrast = degs_subsets[[1]]
  
  # Create header
  cat(
    knitr::knit_expand(text=knitr_header_string,
                      row=i,
                     condition_column=first_contrast[["condition_column"]],
                     condition_group1=first_contrast[["condition_group1"]],
                     condition_group2=first_contrast[["condition_group2"]],
                     assay=first_contrast[["assay"]],
                     slot=first_contrast[["slot"]],
                     test=first_contrast[["test"]],
                     padj=first_contrast[["padj"]],
                     log2FC=first_contrast[["log2FC"]],
                     min_pct=first_contrast[["min_pct"]],
                     latent_vars=ifelse(!is.null(first_contrast[["latent_vars"]]), paste(colnames(first_contrast[["latent_vars"]]), collapse=","), "-"),
                     subset_column=ifelse(is.na(first_contrast[["subset_column"]]), "-", first_contrast[["subset_column"]]))
  , "\n")
  
  # Get error messages
  error_messages = unique(purrr::flatten_chr(purrr::map(degs_subsets, function(contrast){return(contrast[["error_messages"]])})))

   # Create combined results table
  degs_subsets_results = purrr::map_dfr(degs_subsets, function(contrast) {
    subset_group_value = ifelse(!is.na(contrast[["subset_group"]]), contrast[["subset_group"]], "All")
    return(contrast[["results"]] %>% 
      dplyr::summarise(subset_group=subset_group_value,
                       Cells1=length(contrast[["cells_group1_idx"]]),
                       Cells2=length(contrast[["cells_group2_idx"]]),
                       DEGs=length(gene),
                       DEGs_up=sum(avg_log2FC > 0),
                       DEGs_down=sum(avg_log2FC < 0)))
  }) %>% tibble::as_tibble()
  
  # Print warnings/errors
  if (length(error_messages) > 0) {
    warning(error_messages)
  }
  
  # Print summary table
  print(
      knitr::kable(degs_subsets_results,
                   align="l", caption="DEG summary", 
                   col.names=c("Subset", "Cells in group 1", "Cells in group 2", "# DEGs", "# DEGs upregulated", "# DEGs downregulated"), 
                   format="html") %>%
        kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
    )
  
  # Print plots per contrast
  for (contrast in seq(degs_subsets)) {
    if (!"plot_scatter" %in% names(degs_subsets[[contrast]]) & !"plot_feature" %in% names(degs_subsets[[contrast]])) next
      
    p = degs_subsets[[contrast]]$plot_scatter | degs_subsets[[contrast]]$plot_feature
    title = "Scatterplot and feature plots"
    if (!is.na(degs_subsets[[contrast]]$subset_column)) {
      title = paste0(title, " (subset ", degs_subsets[[contrast]]$subset_column, 
                     ": ", degs_subsets[[contrast]]$subset_group, ")")
    }
    
    p = p + patchwork::plot_annotation(title=title)
    print(p)
  }
  
  cat("\n \n")
} 

orig.ident: pbmc_10x vs pbmc_smartseq2_sample1

General configuration:

  • assay: RNA
  • slot: data
  • test: wilcox
  • maximum adjusted p-value: 0.05
  • minimum absolute log2 foldchange: 0
  • minimum percentage of cells: 0.1
  • latent vars: -
  • output files: degs_contrast_row_1_results.xlsx, degs_contrast_row_1_functions.xlsx
Subset on column: ‘-’
DEG summary
Subset Cells in group 1 Cells in group 2 # DEGs # DEGs upregulated # DEGs downregulated
All 4033 311 3667 1097 2570

orig.ident: pbmc_10x vs pbmc_smartseq2_sample1

General configuration:

  • assay: RNA
  • slot: data
  • test: wilcox
  • maximum adjusted p-value: 0.05
  • minimum absolute log2 foldchange: 0
  • minimum percentage of cells: 0.1
  • latent vars: -
  • output files: degs_contrast_row_2_results.xlsx, degs_contrast_row_2_functions.xlsx
Subset on column: ‘seurat_clusters’
DEG summary
Subset Cells in group 1 Cells in group 2 # DEGs # DEGs upregulated # DEGs downregulated
1 50 50 321 133 188
2 50 47 305 120 185
3 50 31 282 125 157
4 50 43 463 133 330
5 50 24 245 92 153
6 50 34 297 122 175
7 50 17 55 7 48
8 50 20 225 90 135
9 50 8 127 3 124
10 40 6 73 0 73

Phase: G1 vs G2M

General configuration:

  • assay: RNA
  • slot: data
  • test: wilcox
  • maximum adjusted p-value: 0.05
  • minimum absolute log2 foldchange: 0
  • minimum percentage of cells: 0.1
  • latent vars: -
  • output files: degs_contrast_row_3_results.xlsx, degs_contrast_row_3_functions.xlsx
Subset on column: ‘seurat_clusters’
DEG summary
Subset Cells in group 1 Cells in group 2 # DEGs # DEGs upregulated # DEGs downregulated
1 30 30 0 0 0
2 30 30 0 0 0

Output

# Add colour lists for orig.dataset
col = GenerateColours(num_colours=length(levels(sc$orig.dataset)), names=levels(sc$orig.dataset), palette=param$col_palette_samples, alphas=1)
sc = ScAddLists(sc, lists=list(orig.dataset=col), lists_slot="colour_lists")

# Add experiment details
Seurat::Misc(sc, "experiment") = list(project_id=param$project_id, date=Sys.Date(), species=gsub("_gene_ensembl", "", param$mart_dataset))

# Add parameter
Seurat::Misc(sc, "parameters") = param

# Add technical output (note: cannot use Misc function here)
sc@misc$technical = data.frame(ScrnaseqSessionInfo(param$path_to_git))

Export to Loupe Cell Browser

If all provided datasets are of type “10x”, we export the UMAP 2D visualisation, metadata such as the cell clusters, and lists of differentially expressed genes, so you can open and work with these in the Loupe Cell Browser https://support.10xgenomics.com/single-cell-gene-expression/software/visualization/latest/what-is-loupe-cell-browser.

if (all(param$path_data$type == "10x")) { 
  
  # Export reductions (umap, pca, others)
  for(r in Seurat::Reductions(sc)) {
    write.table(Seurat::Embeddings(sc, reduction=r)[,1:2] %>% as.data.frame() %>% tibble::rownames_to_column(var="Barcode"),
                file=file.path(param$path_out, "export", paste0("Loupe_projection_", r, ".csv")), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
  }
  
  # Export categorical metadata
  loupe_meta = sc@meta.data
  idx_keep = sapply(1:ncol(loupe_meta), function(x) !is.numeric(loupe_meta[,x]))
  write.table(x=loupe_meta[, idx_keep] %>% tibble::rownames_to_column(var="barcode"), 
              file=file.path(param$path_out, "export", "Loupe_metadata.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")

  # Export gene sets (CC genes, known markers, per-cluster markers up- and downregulated, ...)
  gene_lists = Misc(sc, "gene_lists")
  
  # Remove empty gene sets
  gene_lists = gene_lists[purrr::map_int(gene_lists, length) > 0]
  loupe_genesets = purrr::map_dfr(names(gene_lists), function(n) {return(data.frame(List=n, Name=gene_lists[[n]]))})
  loupe_genesets$Ensembl = seurat_rowname_to_ensembl[loupe_genesets$Name]
  write.table(loupe_genesets, file=file.path(param$path_out, "export", "Loupe_genesets.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
}

Result files are:

  • export/
    • Loupe_projection_(umap|pca|…).csv: Seurat UMAP/PCA/… projections for visualization
    • Loupe_metadata.csv: Seurat cell meta data including clusters and cell cycle phases
    • Loupe_genesets.csv: Gene sets such as markers, DEGs, cell cycles

Projections can be imported in Loupe via “Import Projection”, cell meta data via “Import Categories” and gene sets via “Import Lists”

Export to cellxgene browser

We export the assay data, cell metadata, clustering and visualisation in a format that can be read by the cellxgene browser https://github.com/chanzuckerberg/cellxgene.

# Convert Seurat single cell object to python anndata object which will be accessible via reticulate here
adata = sceasy::convertFormat(sc, from="seurat", to="anndata", outFile=NULL, assay=DefaultAssay(sc))

# Set up correct colours (see https://chanzuckerberg.github.io/cellxgene/posts/prepare) so that colours match
adata$uns = dict(
  seurat_clusters_colors=np_array(unname(Misc(sc, "colour_lists")$seurat_clusters), dtype="<U7"), 
  orig.ident_colors=np_array(unname(Misc(sc, "colour_lists")$orig.ident), dtype="<U7"),
  orig.dataset_colors=np_array(unname(Misc(sc, "colour_lists")$orig.dataset), dtype="<U7"),
  Phase_colors=np_array(unname(Misc(sc, "colour_lists")$Phase), dtype="<U7")
)

# Write to h5ad file
adata$write(file.path(param$path_out, "export", "sc.h5ad"), compression="gzip")

Result files are:

  • export/
    • sc.h5ad: H5AD object for cellxgene browser

Copy/upload to data directory of your cellxgene browser

Export to Cerebro browser

We export the assay data, clustering, visualisation, marker genes, enriched pathways and degs in a format that can be read by the Cerebro Browser https://github.com/romanhaa/cerebroApp/.

# Export to cerebro
res = ExportToCerebro(sc=sc, 
                      path=file.path(param$path_out, "export", "sc.crb"), 
                      assay=DefaultAssay(sc),
                      assay_raw=param$assay_raw,
                      delayed_array=FALSE)

Result files are:

  • export/
    • sc.crb: Object for Cerebro browser

Load into Cerebro browser.

Other output files

# Seurat object
saveRDS(sc, file=file.path(param$path_out, "data", "sc.rds"))

# Counts (RNA)
invisible(ExportSeuratAssayData(sc, 
                      dir=file.path(param$path_out, "data", "counts"), 
                      assays=param$assay_raw, 
                      slot="counts",
                      include_cell_metadata_cols=colnames(sc[[]]),
                      metadata_prefix=paste0(param$project_id, ".")))

# Metadata
openxlsx::write.xlsx(x=sc[[]] %>% tibble::rownames_to_column(var="Barcode"), file=file.path(param$path_out, "data", "cell_metadata.xlsx"), rowNames=FALSE, colNames=TRUE)

# Annotation as excel file
openxlsx::write.xlsx(x=data.frame(seurat_id=rownames(sc), ensembl_gene_id=seurat_rowname_to_ensembl[rownames(sc)], row.names=rownames(sc)) %>%
                       dplyr::inner_join(annot_ensembl, by="ensembl_gene_id"),
                     file=file.path(param$path_out, "annotation", "gene_annotation.xlsx"), 
                     rowNames=FALSE, colNames=TRUE)

# Data and annotation for a subset of 500 cells for Morpheus
# ERROR: This gives an error with the newest packages, might be related to this: 
# https://issueexplorer.com/issue/r-lib/cpp11/244
#  For now, we outcomment this section, since it isn't really needed
#cells_subset = ScSampleCells(sc, n=500, seed=1)
#openxlsx::write.xlsx(Seurat::GetAssayData(sc[, cells_subset], slot="data") %>% as.data.frame() %>% tibble::rownames_to_column(var="gene"), 
#                     file=file.path(param$path_out, "data", paste0("subset_", 500, "_cells_normalised_data.xlsx")),
#                     rowNames=FALSE, colNames=TRUE)
#openxlsx::write.xlsx(Seurat::GetAssayData(sc[, cells_subset], slot="scale.data") %>% as.data.frame() %>% #tibble::rownames_to_column(var="gene"), 
#                     file=file.path(param$path_out, "data", paste0("subset_", 500, "_cells_scaled_data.xlsx")),
#                     rowNames=FALSE, colNames=TRUE)
#openxlsx::write.xlsx(sc[[]][cells_subset, ] %>% as.data.frame() %>% tibble::rownames_to_column(var="cell"), 
#           file=file.path(param$path_out, "data",paste0("subset_", 500, "_cells_column_annotations.xlsx")),
#           rowNames=FALSE, colNames=TRUE)

Result files are:

  • annotation/
    • gene_annotation.xlsx: Ensembl annotation of the genes (Excel format)
    • cell_cycle_markers.xlsx: Markers for cell cycle phases (Excel format)
    • species_gene_ensembl.vEnsembl.annot.txt: Raw annotation downloaded from Ensembl (tab-separated file)
  • data/
    • sc.rds: Seurat object for import into R
    • cell_metadata.xlsx: Cell metadata (Excel format)
    • counts: Raw counts of the entire dataset (sparse matrix)
    • subset_500_cells_normalised_data.xlsx: Normalised subset of 500 cells for heatmap plotting with morpheus (https://software.broadinstitute.org/morpheus/) (Excel format)
    • subset_500_cells_scaled_data.xlsx: Normalised and scaled subset of 500 cells for heatmap plotting with morpheus (Excel format)
    • subset_500_cells_column_annotations.xlsx: Cell annotations for the 500 cells subset (Excel format)
  • figures/
    • all plots of the HTML report (PNG and PDF format)
  • marker_degs/
    • markers_cluster_vs_rest.xslx: Marker genes for each cluster (Excel format)
    • functions_marker_up_cluster_[1,2,3,…]_vs_rest.xslx: Biological functions and pathways that were found enriched in the up-regulated markers for cluster 1, 2, 3, … (Excel format)
    • functions_marker_down_cluster_[1,2,3,…]_vs_rest.xslx: Biological functions and pathways that were found enriched in the down-regulated markers for cluster 1, 2, 3, … (Excel format)
    • degs_contrast_row_[1,2,3,…]_results.xlsx: Results (genes) of the DEG analysis specified in the first, second, third, … row of the configuration parameter (Excel format)
    • degs_contrast_row_[1,2,3,…]_functions.xlsx: Results (enriched functions and pathways) of the DEG analysis specified in the first, second, third, … row of the configuration parameter (Excel format)

Parameter table

The following parameters were used to run the workflow.

out = ScrnaseqParamsInfo(params=param)

knitr::kable(out, align="l") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"), full_width=FALSE, position="left")
Name Value
author kasa397b
project_id pbmc
path_data name:pbmc_10x, pbmc_smartseq2; type:10x, smartseq2; path:test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/10x/, test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/smartseq2/counts_table.tsv.gz; stats:NA, NA; suffix:-1, -2
assay_raw RNA
downsample_cells_n
downsample_cells_equally FALSE
path_out test_datasets/10x_SmartSeq2_pbmc_GSE132044/results/
file_known_markers test_datasets/10x_SmartSeq2_pbmc_GSE132044/known_markers.xlsx
file_contam_cells
mart_dataset hsapiens_gene_ensembl
annot_version 98
annot_main ensembl=ensembl_gene_id, symbol=external_gene_name, entrez=entrezgene_accession
mart_attributes ensembl_gene_id, external_gene_name, entrezgene_accession, chromosome_name, start_position, end_position, percentage_gene_gc_content, gene_biotype, strand, description
biomart_mirror www
mt ^MT-
doublets
cell_filter pbmc_10x:nFeature_RNA=c(NA, NA), percent_mt=c(NA, NA); pbmc_smartseq2_sample1:nFeature_RNA=c(NA, NA), percent_mt=c(NA, NA)
feature_filter pbmc_10x:min_counts=1, min_cells=3; pbmc_smartseq2_sample1:min_counts=1, min_cells=3
samples_min_cells 10
norm RNA
cc_remove FALSE
cc_remove_all FALSE
cc_rescore_after_merge TRUE
integrate_samples method:integrate; dimensions:30; reference:; use_reciprocal_pca:FALSE
pc_n 10
cluster_k 20
umap_k 30
cluster_resolution_test 0.3, 0.7
cluster_resolution 0.5
marker_padj 0.05
marker_log2FC 1
marker_pct 0.25
deg_contrasts condition_column:orig.ident, orig.ident, Phase; condition_group1:pbmc_10x, pbmc_10x, G1; condition_group2:pbmc_smartseq2_sample1, pbmc_smartseq2_sample1, G2M; subset_column:NA, seurat_clusters, seurat_clusters; subset_group:NA, , 1;2; downsample_cells_n:NA, 50, 30
enrichr_site Enrichr
enrichr_padj 0.05
enrichr_dbs GO_Molecular_Function_2018, GO_Biological_Process_2018, GO_Cellular_Component_2018, Azimuth_Cell_Types_2021, CellMarker_Augmented_2021, Descartes_Cell_Types_and_Tissue_2021
col darkblue
col_palette_samples ggsci::pal_jama
col_palette_clusters ggsci::pal_igv
path_to_git .
debugging_mode default_debugging
cores 8
file_annot test_datasets/10x_SmartSeq2_pbmc_GSE132044/results//annotation/hsapiens_gene_ensembl.v98.annot.txt
file_cc_genes test_datasets/10x_SmartSeq2_pbmc_GSE132044/results//annotation/cell_cycle_markers.xlsx
samples_to_drop
col_samples pbmc_10x=#374E55FF, pbmc_smartseq2_sample1=#DF8F44FF
latent_vars orig.ident
col_clusters 1=#5050FFFF, 2=#CE3D32FF, 3=#749B58FF, 4=#F0E685FF, 5=#466983FF, 6=#BA6338FF, 7=#5DB1DDFF, 8=#802268FF, 9=#6BD76BFF, 10=#D595A7FF

Software versions

This report was created with generated using the scrnaseq GitHub repository. Software versions were collected at run time.

out = ScrnaseqSessionInfo(param$path_to_git)

knitr::kable(out, align="l") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
Name Value
Run on: Tue Aug 22 10:39:15 2023
ktrns/scrnaseq 452b82bf37b3c6e4b7f34f4154915474fb982c85
Container singularity-single-cell, 578635e0, 2023-01-13T14:32:05+01:00
R R version 4.2.2 (2022-10-31)
Platform x86_64-conda-linux-gnu (64-bit)
Operating system Debian GNU/Linux 11 (bullseye)
Host name cbioinfws1
Host OS #83~20.04.1-Ubuntu SMP Wed Jun 21 20:23:31 UTC 2023 (5.15.0-76-generic)
Packages abind1.4-5, annotate1.76.0, AnnotationDbi1.60.0, ape5.6-2, assertthat0.2.1, babelgene22.9, backports1.4.1, beachmat2.14.0, beeswarm0.4.0, bibtex0.5.0, Biobase2.58.0, BiocFileCache2.6.0, BiocGenerics0.44.0, BiocParallel1.32.5, BiocSingular1.14.0, biomaRt2.54.0, Biostrings2.66.0, bit4.0.5, bit644.0.5, bitops1.0-7, blob1.2.3, bslib0.4.2, cachem1.0.6, cerebroApp1.3.1, cli3.6.0, cluster2.1.4, codetools0.2-18, colorspace2.0-3, colourpicker1.2.0, cowplot1.1.1, crayon1.5.2, curl5.0.0, data.table1.14.6, DBI1.1.3, dbplyr2.2.1, DelayedArray0.24.0, DelayedMatrixStats1.20.0, deldir1.0-6, digest0.6.31, dplyr1.0.10, DT0.26.3, ellipsis0.3.2, enrichR3.1, evaluate0.19, fansi1.0.3, farver2.1.1, fastmap1.1.0, filelock1.0.2, fitdistrplus1.1-8, fs1.5.2, future1.30.0, future.apply1.10.0, generics0.1.3, GenomeInfoDb1.34.6, GenomeInfoDbData1.2.9, GenomicRanges1.50.2, ggbeeswarm0.7.1, ggplot23.4.0, ggrastr1.0.1, ggrepel0.9.2, ggridges0.5.4, ggsci2.9, globals0.16.2, glue1.6.2, goftest1.2-3, graph1.76.0, gridExtra2.3, GSEABase1.60.0, GSVA1.46.0, gtable0.3.1, HDF5Array1.26.0, highr0.10, hms1.1.2, htmltools0.5.4, htmlwidgets1.6.1, httpuv1.6.8, httr1.4.4, ica1.0-3, igraph1.3.5, IRanges2.32.0, irlba2.3.5.1, jquerylib0.1.4, jsonlite1.8.4, kableExtra1.3.4, KEGGREST1.38.0, KernSmooth2.23-20, knitcitations1.0.12, knitr1.41, labeling0.4.2, later1.3.0, lattice0.20-45, lazyeval0.2.2, leiden0.4.3, lifecycle1.0.3, limma3.54.0, listenv0.9.0, lmtest0.9-40, lubridate1.9.0, magrittr2.0.3, MASS7.3-58.1, MAST1.24.0, Matrix1.5-3, MatrixGenerics1.10.0, matrixStats0.63.0, memoise2.0.1, mime0.12, miniUI0.1.1.1, msigdbr7.5.1, munsell0.5.0, nlme3.1-161, openxlsx4.2.5.1, parallelly1.33.0, patchwork1.1.2, pbapply1.6-0, pillar1.8.1, pkgconfig2.0.3, plotly4.10.1, plyr1.8.8, png0.1-8, polyclip1.10-4, prettyunits1.1.1, progress1.2.2, progressr0.13.0, promises1.2.0.1, purrr1.0.1, qvalue2.30.0, R.methodsS31.8.2, R.oo1.25.0, R.utils2.12.2, R62.5.1, RANN2.6.1, rappdirs0.3.3, RColorBrewer1.1-3, Rcpp1.0.9, RcppAnnoy0.0.20, RCurl1.98-1.9, readr2.1.3, RefManageR1.4.0, reshape21.4.4, reticulate1.27, rhdf52.42.0, rhdf5filters1.10.0, Rhdf5lib1.20.0, rjson0.2.21, rlang1.0.6, rmarkdown2.19, ROCR1.0-11, RSQLite2.2.20, rstudioapi0.14, rsvd1.0.5, Rtsne0.16, rvest1.0.3, S4Vectors0.36.1, sass0.4.4, ScaledMatrix1.6.0, scales1.2.1, scattermore0.8, sceasy0.0.7, sctransform0.3.5, sessioninfo1.2.2, Seurat4.3.0, SeuratObject4.1.3, shiny1.7.4, shinycssloaders1.0.0, shinydashboard0.7.2, shinyFiles0.9.3, shinyjs2.1.0, shinyWidgets0.7.6, SingleCellExperiment1.20.0, sp1.5-1, sparseMatrixStats1.10.0, spatstat.data3.0-0, spatstat.explore3.0-5, spatstat.geom3.0-3, spatstat.random3.0-1, spatstat.sparse3.0-0, spatstat.utils3.0-1, stringi1.7.12, stringr1.5.0, SummarizedExperiment1.28.0, survival3.5-0, svglite2.1.1, systemfonts1.0.4, tensor1.5, tibble3.1.8, tidyr1.2.1, tidyselect1.2.0, timechange0.2.0, tzdb0.3.0, utf81.2.2, uwot0.1.14, vctrs0.5.1, vipor0.4.5, viridis0.6.2, viridisLite0.4.1, vroom1.6.0, webshot0.5.4, withr2.5.0, xfun0.36, XML3.99-0.13, xml21.3.3, xtable1.8-4, XVector0.38.0, yaml2.3.6, zip2.2.2, zlibbioc1.44.0, zoo1.8-11

Credits and References

This Workflow was developed as part of the scrnaseq GitHub repository by Katrin Sameith and Andreas Petzold at the Dresden-concept Genome Center, TU Dresden (Dresden, Germany), in collaboration with Torsten Glomb, Maike Kosanke and Oliver Dittrich at the Research Core Unit Genomics, Hannover Medical School (Hannover, Germany). Seurat Vignettes were initially used as templates.

# Writes knitcitations references to references.bib file.
knitcitations::write.bibtex(file=file.path(param$path_out, "references.bib"))
Gandolfo, Luke C., and Terence P. Speed. 2018. RLE Plots: Visualizing Unwanted Variation in High Dimensional Data.” Edited by Enrique Hernandez-Lemus. PLOS ONE 13 (2): e0191629. https://doi.org/10.1371/journal.pone.0191629.
Hafemeister, Christoph, and Rahul Satija. 2019. “Normalization and Variance Stabilization of Single-Cell RNA-Seq Data Using Regularized Negative Binomial Regression.” Genome Biology 20 (1). https://doi.org/10.1186/s13059-019-1874-1.
Tirosh, Itay, Benjamin Izar, Sanjay M. Prakadan, Marc H. Wadsworth, Daniel Treacy, John J. Trombetta, Asaf Rotem, et al. 2016. “Dissecting the Multicellular Ecosystem of Metastatic Melanoma by Single-Cell RNA-Seq.” Science 352 (6282): 189–96. https://doi.org/10.1126/science.aad0501.
Traag, V. A., L. Waltman, and N. J. van Eck. 2019. “From Louvain to Leiden: Guaranteeing Well-Connected Communities.” Scientific Reports 9 (1). https://doi.org/10.1038/s41598-019-41695-z.